public static String extractTextFromHtml(String html, boolean isForJson) { StringBuilder sb = new StringBuilder(); boolean ignore = false; boolean isXmlData = false; boolean closing = false; for (int i=0; i < html.length(); i++) { char c = html.charAt(i); if (c=='<' & !ignore) { ignore = true; closing = false; } else if (c=='&' && !ignore) { int end = html.indexOf(";", i+1); if (end!=-1) { Boolean isNumber = null; Boolean isEntity = null; for (int x=i+1; x < end; x++) { char ch = html.charAt(x); if ((ch >='0' && ch <= '9') || (ch=='#' && x==c+1)) { if (isNumber == null) isNumber = true; else isNumber &= true; } else if (ch==' ') { if (isEntity==null) isEntity = false; else isEntity = isNumber = false; } else isEntity = true; } if (Boolean.TRUE.equals(isNumber) || Boolean.TRUE.equals(isEntity)) { if (Boolean.TRUE.equals(isNumber)) { String num = html.substring(i+2, end); sb.append(Character.toString((char)Integer.valueOf(num).intValue())); } int add = (end - i); i += add; } else if (ignore && c=='/') { closing = true; } else if (ignore && c=='>') { ignore = false; if (closing && isXmlData) closing = isXmlData = false; } else if (ignore && c==':') { isXmlData = true; } else if (!ignore && !isXmlData) { if (c=='„' || c=='”' && isForJson) c = '"'; if (c=='"' && i>1 && html.charAt(i-1)!='\\' && isForJson) sb.append('\\'); sb.append(c); } } String s = sb.toString().trim(); if (s.startsWith("ĘĄ")) return s.substring(2, s.length()); return s; }
środa, września 14, 2016
How to extract text from MS HTML
Subskrybuj:
Komentarze do posta (Atom)
0 komentarze:
Prześlij komentarz