poniedziałek, grudnia 10, 2012

Wykrywanie kodowania znaków w requescie HTTP

String acceptCharset = request.getHeader(ACCEPT_CHARSET);
Charset charset = getBestCharset(acceptCharset!=null ? acceptCharset :
 getCharsetFromContentType(request.getHeader(CONTENT_TYPE), content));
  
    /**
     * Returns the first charset in a comma-separated list that this JVM supports.
     *
     * Each entry may carry parameters after a semicolon (for example
     * "utf-8;q=0.7"); the parameters are discarded before the name is looked
     * up. Entries are tried in the order given; q-values are not used for
     * ordering.
     *
     * @param acceptCharset comma-separated charset names, or null
     * @return the first supported charset, or UTF-8 when the argument is null
     *         or no entry names a supported charset
     */
    private static Charset getBestCharset(String acceptCharset) {
        if (acceptCharset==null)
            return CharsetUtil.UTF_8;
        StringTokenizer st = new StringTokenizer(acceptCharset, ",");
        while (st.hasMoreTokens()) {
            String name = st.nextToken();
            // Strip parameters such as ";q=0.7": Charset.forName would reject
            // the whole token otherwise and the entry could never match.
            int paramPos = name.indexOf(';');
            if (paramPos != -1)
                name = name.substring(0, paramPos);
            try {
                return Charset.forName(name.trim());
            }
            catch (IllegalArgumentException ignored) {
                // Illegal or unsupported name (this includes "*" and empty
                // entries): move on to the next candidate.
            }
        }
        return CharsetUtil.UTF_8;
    }
 
    /**
     * Determines a charset name for a request body.
     *
     * The "charset=" parameter of the Content-Type value is used when it is
     * present and non-empty. Otherwise the readable bytes of the body are
     * scanned (decoding %XX escapes on the fly) and bytes matching the
     * WIN1250 and ISO_8859_2 tables are counted; the encoding with more
     * matches wins, with ties going to windows-1250.
     *
     * @param ct   the Content-Type header value, may be null
     * @param data the body to inspect when the header names no charset, may be null
     * @return the lower-cased charset name from the header, "windows-1250" or
     *         "iso-8859-2" from the byte scan, or null when nothing was detected
     */
    private static String getCharsetFromContentType(String ct, ChannelBuffer data) {
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        ct = ct!=null ? ct.toLowerCase(java.util.Locale.ROOT) : "";
        int charsetPos = ct.indexOf("charset=");
        if (charsetPos!=-1) {
            int start = charsetPos+8;
            // The value may be a quoted string: skip the opening quote.
            if (start < ct.length() && ct.charAt(start)=='"')
                start++;
            // Consume the whole name, including '-' and similar characters, and
            // accept a value that runs to the end of the header.
            int end = start;
            while (end < ct.length() && isCharsetNameChar(ct.charAt(end)))
                end++;
            if (end > start)
                return ct.substring(start, end);
        }
        if (data!=null) {
            int win1250_score = 0;
            int iso8859_2_score = 0;
            // getByte() takes absolute indices while readableBytes() is relative
            // to the reader index, so offset every access by the reader index.
            int base = data.readerIndex();
            int length = data.readableBytes();
            for (int i=0; i < length; i++) {
                int b = (data.getByte(base+i) & 0xff);
                if (b == '%' && i+2 < length) {
                    try {
                        b = (getHex(data.getByte(base+i+1)) << 4) + getHex(data.getByte(base+i+2));
                    }
                    catch (RuntimeException ignored) {
                        // Not a valid %XX escape: score the literal '%' byte.
                    }
                }
                for (int k=0; k < WIN1250.length; k++) {
                    if (b == WIN1250[k])
                        win1250_score++;
                    else if (b == ISO_8859_2[k])
                        iso8859_2_score++;
                }
            }
            if (win1250_score > 0 && win1250_score >= iso8859_2_score)
                return "windows-1250";
            // Reaching this point with a positive ISO score means it beat windows-1250.
            if (iso8859_2_score > 0)
                return "iso-8859-2";
        }
        return null;
    }

    /** Tells whether c may appear in a charset name: a letter, a digit or one of "-_.:+". */
    private static boolean isCharsetNameChar(char c) {
        return Character.isLetterOrDigit(c)
            || c=='-' || c=='_' || c=='.' || c==':' || c=='+';
    }
 
    /**
     * Converts a single ASCII hexadecimal digit, given as a byte, to its value.
     *
     * @param b the byte holding '0'-'9', 'a'-'f' or 'A'-'F'
     * @return the digit value in the range 0..15
     * @throws RuntimeException if the byte is not a hexadecimal digit
     */
    private final static int getHex(byte b) {
        final int code = b & 0xff;
        // For codes 0..255 only the ASCII digits and letters have a radix-16 value.
        final int value = Character.digit((char) code, 16);
        if (value < 0)
            throw new RuntimeException("Char "+code+" is not valid hex");
        return value;
    }
 
    /* ąśźĄŚŹ */
    private final static int WIN1250[] = { 185,156,159,165,140,143 };
    private final static int ISO_8859_2[] = { 177,182,188,161,166,172 };

0 komentarzy: