środa, września 14, 2016

How to extract text from MS HTML

public static String extractTextFromHtml(String html, boolean isForJson) {
StringBuilder sb = new StringBuilder();
boolean ignore = false; 
boolean isXmlData = false;
boolean closing = false;
      
for (int i=0; i < html.length(); i++) {
char c = html.charAt(i);
if (c=='<' & !ignore) {
 ignore = true;
 closing = false;
}
else if (c=='&' && !ignore) {
 int end = html.indexOf(";", i+1);    
 if (end!=-1) {
  Boolean isNumber = null;
  Boolean isEntity = null;
  for (int x=i+1; x < end; x++) {
   char ch = html.charAt(x);
   if ((ch >='0' && ch <= '9') || (ch=='#' && x==c+1)) {
    if (isNumber == null)
     isNumber = true;
    else
     isNumber &= true;
   }
   else if (ch==' ') {
    if (isEntity==null)
     isEntity = false;
    else
     isEntity = isNumber = false;
   }
   else
    isEntity = true;
  }
  if (Boolean.TRUE.equals(isNumber) || Boolean.TRUE.equals(isEntity)) {       
   if (Boolean.TRUE.equals(isNumber)) {
   String num = html.substring(i+2, end);
   sb.append(Character.toString((char)Integer.valueOf(num).intValue()));       
          }
          int add = (end - i);
   i += add;
         }     
  else if (ignore && c=='/') {
   closing = true;
  }
  else if (ignore && c=='>') {
   ignore = false;
   if (closing && isXmlData)
    closing = isXmlData = false;
  }
  else if (ignore && c==':') {
   isXmlData = true;
  }
  else if (!ignore && !isXmlData) {
   if (c=='„' || c=='”' && isForJson)
    c = '"';
   if (c=='"' && i>1 && html.charAt(i-1)!='\\' && isForJson)
    sb.append('\\');      
   sb.append(c);    
  }
 }
 String s = sb.toString().trim();
 if (s.startsWith("ĘĄ"))
  return s.substring(2, s.length());
 return s;
}

sobota, września 10, 2016

Ewolucja bakterii

http://www.npr.org/sections/health-shots/2016/09/08/492965889/watch-bacteria-invade-antibiotics-and-transform-into-superbugs

piątek, września 09, 2016

Lądowanie w Modlinie z GPS-em