Re^3: unicode version of readdir

in reply to Re^2: unicode version of readdir
in thread unicode version of readdir

The problem with this particular issue (as I've said elsewhere) is that the interface for the routines (which ARE completely pluggable) does not include a way to tell the calling code that the strings are Unicode. They are all based around crude UNIX-style char * interfaces.

So in this case I wouldn't expect the kind of thing you are referring to to come up; it will just be such a big job, with such huge ramifications, that it won't happen until 5.12 at least. :-(

I guess I'm one of the tame win32 users. Although anybody who knows me well knows that 'tame' is not the best description. ;-)

---
$world=~s/war/peace/g

Comment on Re^3: unicode version of readdir

Replies are listed 'Best First'.
Re^4: unicode version of readdir by dk (Chaplain) on Sep 18, 2007 at 05:23 UTC
Rejoice! I've found an easy way to pass that info. I'm working on the implementation and will post it to p5p when ready, but the idea is to add `int PL_dir_unicode`, which will contain a set of unicode flags. Syscalls that are unaware of these hints will keep their default behavior, and new code will differentiate between bytes and unicode semantics. New unicode semantics are proposed along with it which, when compiled in, have the following consequences: a) filename functions (such as open, stat, etc.) have a chance to behave differently when passed unicode scalars; b) after `binmode(DIRHANDLE, ':utf8')`, readdir will return unicode scalars, if appropriate. An interesting consequence of that would be that even on unicode-unaware OSes, readdir() will also return unicode scalars, without touching any system-specific code -- which I think is really cool. The prototype is: #define DIRf_HINT_WANT_UTF8_RESULT 1 #define DIRf_HINT_PARAM1_IS_UTF8 2 #define DIRf_HINT_PARAM2_IS_UTF8 4 #define DIRf_RESULT_IS_UTF8 8 #define DIRf_RESULT_IS_BYTES 16 #ifdef UTF8_FILENAME_SEMANTICS #define SET_DIR_UTF8_HINTS(flags) PL_dir_unicode = (flags) #define isDIR_RESULT_WANTED_AS_UTF8 ((PL_dir_unicode) & DIRf_HINT_WANT_UTF8_RESULT) #define isDIR_PARAM_UTF8 ((PL_dir_unicode) & DIRf_HINT_PARAM1_IS_UTF8) #define isDIR_PARAM2_UTF8 ((PL_dir_unicode) & DIRf_HINT_PARAM2_IS_UTF8) #define PERLIO_UTF8_CHECK_RESULT(sv) \ if ( PL_dir_unicode & DIRf_RESULT_IS_UTF8) { \ SvUTF8_on((sv)); \ } else if ( !(PL_dir_unicode & DIRf_RESULT_IS_BYTES)) { \ STRLEN len; \ const char * const s = SvPV(sv,len); \ if (is_utf8_string((const U8*)s,len)) { \ SvUTF8_on((sv)); \ } \ } \ #else #define SET_DIR_UTF8_HINTS(flags) #define isDIR_RESULT_WANTED_AS_UTF8 0 #define isDIR_PARAM_UTF8 0 #define isDIR_PARAM2_UTF8 0 #define PERLIO_UTF8_CHECK_RESULT(sv) #endif #define PERLIO_UTF8_CONTEXT(u1) \ SET_DIR_UTF8_HINTS((u1) ? DIRf_HINT_PARAM1_IS_UTF8 : 0) #define PERLIO_UTF8_CONTEXT2(u1,u2) \ SET_DIR_UTF8_HINTS( \ ((u1) ? 
DIRf_HINT_PARAM1_IS_UTF8 : 0) | \ ((u2) ? DIRf_HINT_PARAM2_IS_UTF8 : 0)) #define PERLIO_UTF8_CONTEXT_FROM_SV(sv) \ PERLIO_UTF8_CONTEXT(SvUTF8(sv)) #define PERLIO_UTF8_CONTEXT_FROM_SV2(sv1,sv2) \ PERLIO_UTF8_CONTEXT(SvUTF8(sv1),SvUTF8(sv2)) #define PERLIO_UTF8_CONTEXT_RETURN(ret) \ SET_DIR_UTF8_HINTS((ret) ? DIRf_RESULT_IS_UTF8 : 0) #define PERLIO_UTF8_CLEAR_CONTEXT \ SET_DIR_UTF8_HINTS(0) [download] caller code: (pp_stat, for example): `PERLIO_UTF8_CONTEXT_FROM_SV(sv); if (PL_op->op_type == OP_LSTAT) PL_laststatval = PerlLIO_lstat(SvPV_nolen_const(PL_statname), &PL_statcache); else PL_laststatval = PerlLIO_stat(SvPV_nolen_const(PL_statname), &PL_statcache); PERLIO_UTF8_CLEAR_CONTEXT;` [download] and implementation of win32_stat in win32.c: `BOOL do_utf8 = isDIR_PARAM_UTF8 && IsWin2000(); ... if ( do_utf8) { WCHAR buf[MAX_PATH+1]; l = MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, MAX_PATH+1); path = (char*) PerlDir_mapW(buf); } else { path = PerlDir_mapA(path); l = strlen(path); } ... res = do_utf8 ? wstat(( WCHAR*) path, sbuf) : stat(path, sbuf);` [download] This way the new functionality won't have any effect on platforms without utf8 filenames. Early criticism is welcome. [reply] [d/l] [select]

Replies are listed 'Best First'.

Re^4: unicode version of readdir
by dk (Chaplain) on Sep 18, 2007 at 05:23 UTC

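Rejoice! I've found an easy way to pass that info. I'm working on the implementation and will post it to p5p when ready, but the idea is to add `int PL_dir_unicode`, which will contain a set of unicode flags. Syscalls that are unaware of these hints will keep their default behavior, and new code will differentiate between bytes and unicode semantics.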

New unicode semantics are proposed along with it which, when compiled in, have the following consequences: a) filename functions (such as open, stat, etc.) have a chance to behave differently when passed unicode scalars; b) after binmode(DIRHANDLE, ':utf8'), readdir will return unicode scalars, if appropriate. An interesting consequence of that would be that even on unicode-unaware OSes, readdir() will also return unicode scalars, without touching any system-specific code -- which I think is really cool.

The prototype is:

/*
 * Bit flags kept in PL_dir_unicode.  Each flag occupies its own bit so
 * that several of them can be OR-ed together into a single hint word.
 *
 *   DIRf_HINT_WANT_UTF8_RESULT  tested by isDIR_RESULT_WANTED_AS_UTF8
 *   DIRf_HINT_PARAM1_IS_UTF8    set by PERLIO_UTF8_CONTEXT / _CONTEXT2,
 *                               tested by isDIR_PARAM_UTF8
 *   DIRf_HINT_PARAM2_IS_UTF8    set by PERLIO_UTF8_CONTEXT2,
 *                               tested by isDIR_PARAM2_UTF8
 *   DIRf_RESULT_IS_UTF8         set by PERLIO_UTF8_CONTEXT_RETURN; makes
 *                               PERLIO_UTF8_CHECK_RESULT flag the SV as UTF-8
 *   DIRf_RESULT_IS_BYTES        makes PERLIO_UTF8_CHECK_RESULT skip its
 *                               is_utf8_string() auto-detection
 */
#define DIRf_HINT_WANT_UTF8_RESULT  0x01
#define DIRf_HINT_PARAM1_IS_UTF8    0x02
#define DIRf_HINT_PARAM2_IS_UTF8    0x04
#define DIRf_RESULT_IS_UTF8         0x08
#define DIRf_RESULT_IS_BYTES        0x10

#ifdef UTF8_FILENAME_SEMANTICS

/*
 * UTF-8 filename semantics compiled in: the macros below read and write
 * the hint word PL_dir_unicode (declared elsewhere) using the DIRf_* bits.
 */

/* Replace the whole hint word with `flags`.  Parenthesized so that the
 * expansion stays a single expression wherever it is used. */
#define SET_DIR_UTF8_HINTS(flags)    (PL_dir_unicode = (flags))

/* Non-zero when the corresponding DIRf_HINT_* bit is set. */
#define isDIR_RESULT_WANTED_AS_UTF8  ((PL_dir_unicode) & DIRf_HINT_WANT_UTF8_RESULT)
#define isDIR_PARAM_UTF8             ((PL_dir_unicode) & DIRf_HINT_PARAM1_IS_UTF8)
#define isDIR_PARAM2_UTF8            ((PL_dir_unicode) & DIRf_HINT_PARAM2_IS_UTF8)

/*
 * Decide whether `sv` should carry the UTF-8 flag after a syscall:
 *   - DIRf_RESULT_IS_UTF8 set:   turn the flag on unconditionally;
 *   - DIRf_RESULT_IS_BYTES set:  leave the scalar untouched;
 *   - neither set:               turn the flag on only if the scalar's
 *                                bytes pass is_utf8_string().
 *
 * Wrapped in do { } while (0) so it behaves as one statement (safe in an
 * unbraced if/else) and requires the trailing semicolon at the call site.
 * `sv` appears more than once in the expansion, so pass an expression
 * without side effects.
 */
#define PERLIO_UTF8_CHECK_RESULT(sv)                                        \
    do {                                                                    \
        if (PL_dir_unicode & DIRf_RESULT_IS_UTF8) {                         \
            SvUTF8_on((sv));                                                \
        } else if (!(PL_dir_unicode & DIRf_RESULT_IS_BYTES)) {              \
            STRLEN utf8chk_len;                                             \
            const char * const utf8chk_pv = SvPV((sv), utf8chk_len);        \
            if (is_utf8_string((const U8 *)utf8chk_pv, utf8chk_len)) {      \
                SvUTF8_on((sv));                                            \
            }                                                               \
        }                                                                   \
    } while (0)

#else

/*
 * UTF-8 filename semantics compiled out: hints are never stored, every
 * query is the constant 0, and the result check does nothing.  The no-op
 * forms are kept statement-shaped so call sites parse identically in both
 * configurations.  The argument is not evaluated.
 */
#define SET_DIR_UTF8_HINTS(flags)    ((void)0)

#define isDIR_RESULT_WANTED_AS_UTF8  0
#define isDIR_PARAM_UTF8             0
#define isDIR_PARAM2_UTF8            0

#define PERLIO_UTF8_CHECK_RESULT(sv) do { } while (0)

#endif

/*
 * Convenience wrappers around SET_DIR_UTF8_HINTS().  Each one hands a
 * complete flag word to SET_DIR_UTF8_HINTS, so it replaces any hints
 * set earlier rather than adding to them.
 *
 *   PERLIO_UTF8_CONTEXT(u1)           one path argument; u1 non-zero
 *                                     means it is UTF-8
 *   PERLIO_UTF8_CONTEXT2(u1,u2)       two path arguments
 *   PERLIO_UTF8_CONTEXT_FROM_SV(sv)   as above, taking the flag from
 *   PERLIO_UTF8_CONTEXT_FROM_SV2(..)  SvUTF8() of the scalar(s)
 *   PERLIO_UTF8_CONTEXT_RETURN(ret)   record that a result is UTF-8
 *   PERLIO_UTF8_CLEAR_CONTEXT         reset all hints to 0
 */
#define PERLIO_UTF8_CONTEXT(u1)                \
    SET_DIR_UTF8_HINTS((u1) ? DIRf_HINT_PARAM1_IS_UTF8 : 0)
#define PERLIO_UTF8_CONTEXT2(u1,u2)            \
    SET_DIR_UTF8_HINTS(                \
        ((u1) ? DIRf_HINT_PARAM1_IS_UTF8 : 0) |    \
        ((u2) ? DIRf_HINT_PARAM2_IS_UTF8 : 0))
#define PERLIO_UTF8_CONTEXT_FROM_SV(sv)            \
    PERLIO_UTF8_CONTEXT(SvUTF8(sv))
/* Two scalars need the two-argument form; the one-argument
 * PERLIO_UTF8_CONTEXT cannot accept both flags. */
#define PERLIO_UTF8_CONTEXT_FROM_SV2(sv1,sv2)        \
    PERLIO_UTF8_CONTEXT2(SvUTF8(sv1),SvUTF8(sv2))
#define PERLIO_UTF8_CONTEXT_RETURN(ret)            \
    SET_DIR_UTF8_HINTS((ret) ? DIRf_RESULT_IS_UTF8 : 0)
#define PERLIO_UTF8_CLEAR_CONTEXT                       \
    SET_DIR_UTF8_HINTS(0)
[download]

caller code: (pp_stat, for example):

    /* Publish whether the path scalar is flagged UTF-8 so that the
     * stat implementation behind PerlLIO_(l)stat can consult the hint.
     * NOTE(review): the hint is taken from `sv` while the path bytes come
     * from PL_statname -- confirm both refer to the same string. */
    PERLIO_UTF8_CONTEXT_FROM_SV(sv);
    if (PL_op->op_type == OP_LSTAT)
        PL_laststatval = PerlLIO_lstat(SvPV_nolen_const(PL_statname),
                                       &PL_statcache);
    else
        PL_laststatval = PerlLIO_stat(SvPV_nolen_const(PL_statname),
                                      &PL_statcache);
    /* Reset the hints so they do not leak into a later, unrelated call. */
    PERLIO_UTF8_CLEAR_CONTEXT;
[download]

and implementation of win32_stat in win32.c:

/* Take the wide-character path only when the caller flagged the path
 * parameter as UTF-8 and IsWin2000() holds. */
BOOL    do_utf8 = isDIR_PARAM_UTF8 && IsWin2000();
...
    if ( do_utf8) {
        WCHAR buf[MAX_PATH+1];
        /* Convert the UTF-8 path to UTF-16; -1 means the input is
         * NUL-terminated.
         * NOTE(review): the return value is not checked.  Per the Win32
         * documentation MultiByteToWideChar returns 0 on failure (e.g. a
         * path that does not fit in buf), in which case buf must not be
         * used -- confirm how the enclosing function should report that. */
        l = MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, MAX_PATH+1);
        /* The wide path is carried in the char* variable and cast back
         * to WCHAR* before the wstat call below. */
        path = (char*) PerlDir_mapW(buf);
    } else {
        path = PerlDir_mapA(path);
        /* NOTE(review): here l is the byte length of the mapped path
         * without its terminator, while above it is the converted
         * character count of the unmapped path (which includes the
         * terminator when -1 is passed) -- verify what later code
         * expects l to mean. */
        l = strlen(path);
    }
...
    res = do_utf8 ?
       wstat(( WCHAR*) path, sbuf) :
       stat(path, sbuf);
[download]

[reply]
[d/l]
[select]

In Section Seekers of Perl Wisdom