niedziela, kwietnia 28, 2013

Java file names encoding

/**
 * Prints the name of one entry of /home/user/jdk/test as seen through the
 * NIO.2 API (labelled "API 1.7") and through java.io.File (labelled "API 1.4"),
 * so the decoded file names can be compared.
 *
 * @param args unused
 * @throws IOException if the directory stream cannot be opened or closed
 */
public static void main(String[] args) throws IOException {
    Path p1;
    // The directory stream keeps the directory open until it is closed, so
    // release it as soon as the first entry has been taken.
    try (java.nio.file.DirectoryStream<Path> entries =
            Files.newDirectoryStream(Paths.get("/home/user/jdk/test"))) {
        p1 = entries.iterator().next();
    }
    // The same Path rendered two ways: via its URI and via java.io.File.
    System.out.println("API 1.7: "+p1.toUri().getPath());
    System.out.println("API 1.7: "+p1.toFile().getAbsolutePath());
    // Legacy listing: File.list() returns bare names, not full paths.
    String p2 = new java.io.File("/home/user/jdk/test").list()[0];
    System.out.println("API 1.4: "+p2);
}

user@user-Aspire-4530:~/workspace/TestApp/bin$ LC_ALL="pl_PL.cp1250" java -Dfile.encoding=utf8 TestApp
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.4: g��eg������ka-����������
user@user-Aspire-4530:~/workspace/TestApp/bin$ LC_ALL="pl_PL.cp1250" java -Dfile.encoding=cp1250 TestApp
API 1.7: /home/user/jdk/test/g�eg���ka-����
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.4: g??eg??????ka-??????????
user@user-Aspire-4530:~/workspace/TestApp/bin$ LC_ALL="pl_PL.utf-8" java -Dfile.encoding=cp1250 TestApp
API 1.7: /home/user/jdk/test/g�eg���ka-����
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.4: g�eg���ka-����
user@user-Aspire-4530:~/workspace/TestApp/bin$ LC_ALL="en_US" java -Dfile.encoding=utf8 TestApp
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.4: g��eg������ka-����������
user@user-Aspire-4530:~/workspace/TestApp/bin$ LC_ALL="en_US" java TestApp
API 1.7: /home/user/jdk/test/g?eg???ka-?????
API 1.7: /home/user/jdk/test/g??eg??????ka-??????????
API 1.4: g??eg??????ka-??????????

user@user-Aspire-4530:~/workspace/TestApp/bin$ LC_ALL="pl_PL.utf8" java -Dfile.encoding=utf8 TestApp
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.7: /home/user/jdk/test/gżegżółka-ąęćśń
API 1.4: gżegżółka-ąęćśń

Why are there differences?

sun.nio.fs.UnixDirectoryStreamIterator creates a String from the bytes using the default JVM encoding:

private Path readNextEntry() {
            assert Thread.holdsLock(this);

            for (;;) {
                byte[] nameAsBytes = null;

                // prevent close while reading
                readLock().lock();
                try {
                    if (isOpen()) {
                        nameAsBytes = readdir(dp);
                    }
                } catch (UnixException x) {
                    IOException ioe = x.asIOException(dir);
                    throw new DirectoryIteratorException(ioe);
                } finally {
                    readLock().unlock();
                }
...

File.list() is a native C/C++ method (surprise!):

JNIEXPORT jobjectArray JNICALL
Java_java_io_UnixFileSystem_list(JNIEnv *env, jobject this,
                                 jobject file)
{
    DIR *dir = NULL;
    struct dirent64 *ptr;
    struct dirent64 *result;
    int len, maxlen;
    jobjectArray rv, old;

    WITH_FIELD_PLATFORM_STRING(env, file, ids.path, path) {
        dir = opendir(path);
    } END_PLATFORM_STRING(env, path);
    if (dir == NULL) return NULL;

    ptr = malloc(sizeof(struct dirent64) + (PATH_MAX + 1));
    if (ptr == NULL) {
        JNU_ThrowOutOfMemoryError(env, "heap allocation failed");
        closedir(dir);
        return NULL;
    }

    /* Allocate an initial String array */
    len = 0;
    maxlen = 16;
    rv = (*env)->NewObjectArray(env, maxlen, JNU_ClassString(env), NULL);
    if (rv == NULL) goto error;

    /* Scan the directory */
    while ((readdir64_r(dir, ptr, &result) == 0)  && (result != NULL)) {
        jstring name;
        if (!strcmp(ptr->d_name, ".") || !strcmp(ptr->d_name, ".."))
            continue;
        if (len == maxlen) {
            old = rv;
            rv = (*env)->NewObjectArray(env, maxlen <<= 1,
                                        JNU_ClassString(env), NULL);
            if (rv == NULL) goto error;
            if (JNU_CopyObjectArray(env, rv, old, len) < 0) goto error;
            (*env)->DeleteLocalRef(env, old);
        }
        name = JNU_NewStringPlatform(env, ptr->d_name);
...

/*
 * Creates a Java String from the NUL-terminated platform string `str`.
 *
 * nativeNewStringPlatform() is tried first. When it yields NULL, the bytes
 * are decoded according to fastEncoding: dedicated converters handle
 * 8859_1, 646_US and Cp1252; any other encoding goes through a String
 * constructor that receives the raw bytes (plus jnuEncoding when that
 * encoding is supported).
 *
 * Returns the new jstring, or NULL when local reference capacity cannot be
 * ensured or the byte array cannot be allocated.
 *
 * NOTE(review): when nativeNewStringPlatform() returns non-NULL, control
 * falls through to the final `return NULL` and that result is discarded --
 * confirm this is intended.
 */
JNIEXPORT jstring JNICALL
JNU_NewStringPlatform(JNIEnv *env, const char *str)
{
    jstring result;
    result = nativeNewStringPlatform(env, str);
    if (result == NULL) {
        jbyteArray hab = 0;
        int len;

        /* Determine the encoding on first use. */
        if (fastEncoding == NO_ENCODING_YET)
            initializeEncoding(env);

        /* Fast paths; 8859_1 is also used if the encoding is still unknown. */
        if ((fastEncoding == FAST_8859_1) || (fastEncoding == NO_ENCODING_YET))
            return newString8859_1(env, str);
        if (fastEncoding == FAST_646_US)
            return newString646_US(env, str);
        if (fastEncoding == FAST_CP1252)
            return newStringCp1252(env, str);

        if ((*env)->EnsureLocalCapacity(env, 2) < 0)
            return NULL;

        /* Copy the raw bytes into a Java byte[] and let String decode them. */
        len = (int)strlen(str);
        hab = (*env)->NewByteArray(env, len);
        if (hab != 0) {
            (*env)->SetByteArrayRegion(env, hab, 0, len, (jbyte *)str);
            if (jnuEncodingSupported(env)) {
                result = (*env)->NewObject(env, JNU_ClassString(env),
                                           String_init_ID, hab, jnuEncoding);
            } else {
                /* If the encoding specified in sun.jnu.encoding is not endorsed
                   by "Charset.isSupported" we have to fall back to use String(byte[])
                   explicitly here without specifying the encoding name, in which the
                   StringCoding class will pickup the iso-8859-1 as the fallback
                   converter for us.
                 */
                /* Constructors are looked up under the special name "<init>". */
                jmethodID mid = (*env)->GetMethodID(env, JNU_ClassString(env),
                                                    "<init>", "([B)V");
                result = (*env)->NewObject(env, JNU_ClassString(env), mid, hab);
            }
            (*env)->DeleteLocalRef(env, hab);
            return result;
        }
    }
    return NULL;
}

Conclusions: When you get file names from anything other than the native filesystem, you may end up with damaged strings. If they are not damaged but only use different UTF-8 representations (Unicode normalization forms), you can convert them with java.text.Normalizer.

0 komentarze: