/*
 * Copyright 2007 Marc Wick, geonames.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package org.geonames.wikipedia;

import java.util.regex.Pattern;

/**
 * Extracts short plain-text summaries from wikipedia article markup.
 *
 * @author marc
 *
 */
public class TextSummaryExtractor {

    /** Case-insensitive prefix of an audio template; its last parameter is kept as text. */
    private static final String AUDIO_TEMPLATE = "{{audio";

    /** Case-insensitive prefix of a formatnum template; its number is kept as text. */
    private static final String FORMATNUM_TEMPLATE = "{{formatnum";

    /**
     * The HTML non-breaking-space entity. The ampersand is written as a
     * unicode escape so that the entity survives HTML rendering of this file.
     */
    private static final String HTML_NBSP = "\u0026nbsp;";

    /** The non-breaking-space character (U+00A0). */
    private static final char NBSP = '\u00A0';

    /** Marker appended to a summary that has been shortened. */
    private static final String ELLIPSIS = " (...)";

    /** Parenthesis containing no word character at all. */
    private static final Pattern EMPTY_PARENTHESIS = Pattern.compile("\\([^\\w]*\\)");

    /** Opening parenthesis followed by left-over commas and blanks. */
    private static final Pattern AFTER_OPENING_PARENTHESIS = Pattern.compile("\\([, ]*");

    /** Left-over commas and blanks in front of a closing parenthesis. */
    private static final Pattern BEFORE_CLOSING_PARENTHESIS = Pattern.compile("[, ]*\\)");

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup
     * is removed. The summary is cut at or before <code>length</code>
     * characters (plus the ellipsis marker when it has been shortened). The
     * extractor tries to end the summary at a full stop and stops at a new
     * paragraph.
     *
     * @param pText
     *            the wikipedia markup, may be null
     * @param length
     *            maximal length of the summary, 0 for no limit
     * @return the summary, or null if pText is null
     */
    public static String extractSummary(String pText, int length) {
        return extractSummary(pText, length, true);
    }

    /**
     * Extracts a text summary from a wikipedia article.
     *
     * @param pText
     *            the wikipedia markup, may be null
     * @param length
     *            maximal length of the summary, 0 for no limit
     * @param stopAtParagraph
     *            if true the summary ends at the first remaining "==" marker
     *            (provided the marker is not within the first characters of
     *            the text)
     * @return the summary, or null if pText is null
     */
    public static String extractSummary(String pText, int length,
            boolean stopAtParagraph) {
        if (pText == null) {
            return null;
        }
        String textString = tidy(stripMarkup(pText, length));
        return truncate(textString, length, stopAtParagraph);
    }

    /**
     * Copies the plain text of the markup into a new string, skipping
     * templates, html elements and comments, links and section headers.
     * Scanning stops once clearly more raw text than needed for a summary of
     * the given length has been collected.
     */
    private static String stripMarkup(String pText, int length) {
        StringBuilder summary = new StringBuilder();
        final int textLength = pText.length();
        // long arithmetic: 2 * length overflows an int for very large lengths
        final long rawLimit = 50L + 2L * length;
        int idx = 0;

        while (idx < textLength
                && (length == 0 || summary.length() < rawLimit)) {
            char c = pText.charAt(idx);
            // look ahead one character; '\0' never equals a markup character
            char next = idx + 1 < textLength ? pText.charAt(idx + 1) : '\0';

            if (c == '{') {
                // skip the template, but keep the label of audio and
                // formatnum templates
                int endidx = skipTemplate(pText, idx);
                appendTemplateLabel(pText, idx, endidx, summary);
                idx = endidx;
                continue;
            } else if (c == '<') {
                // html comment or html element, both are skipped
                idx = next == '!' ? skipHTMLComment(pText, idx)
                        : skipHTMLElement(pText, idx);
                continue;
            } else if (c == '[') {
                if (next == '[') {
                    // two square brackets "[[" (link)
                    int endOfLink = pText.indexOf("]]", idx);

                    // a colon within the link marks an image, category or
                    // interwiki link; its caption may itself contain links
                    int colon = pText.indexOf(':', idx);
                    if (colon > -1 && colon < endOfLink) {
                        idx = findEndOfLink(pText, idx);
                        continue;
                    }

                    // keep the anchor text of a piped link, otherwise the
                    // link target
                    int beginAnchor = pText.indexOf('|', idx);
                    if (beginAnchor > -1 && beginAnchor < endOfLink) {
                        idx = beginAnchor + 1;
                    } else {
                        idx = idx + 2;
                    }
                    continue;
                }
                // single square bracket: reference link to be removed. An
                // unterminated bracket is kept as ordinary text.
                int endOfLink = pText.indexOf(']', idx);
                if (endOfLink > -1) {
                    idx = endOfLink + 1;
                    continue;
                }
            } else if (c == ']' && next == ']') {
                idx = idx + 2;
                continue;
            } else if (c == '=' && next == '=') {
                // section header, skipped including its title. An
                // unterminated header falls through and is kept as text;
                // this guarantees that the loop always advances.
                int endHeaderIdx = pText.indexOf("==", idx + 2);
                if (endHeaderIdx > -1) {
                    idx = endHeaderIdx + 2;
                    continue;
                }
            }

            summary.append(c);
            idx++;
        }
        return summary.toString();
    }

    /**
     * Appends the visible label of an audio template (its last parameter) or
     * of a formatnum template (the number after the colon) to the summary.
     * Other templates do not contribute any text. All lookups are restricted
     * to the template itself.
     *
     * @param pText
     *            the wikipedia text
     * @param start
     *            idx of the opening brace of the template
     * @param end
     *            idx just behind the template, as returned by skipTemplate
     * @param summary
     *            the buffer the label is appended to
     */
    private static void appendTemplateLabel(String pText, int start, int end,
            StringBuilder summary) {
        boolean audio = pText.regionMatches(true, start, AUDIO_TEMPLATE, 0,
                AUDIO_TEMPLATE.length());
        boolean formatnum = pText.regionMatches(true, start,
                FORMATNUM_TEMPLATE, 0, FORMATNUM_TEMPLATE.length());
        if (!audio && !formatnum) {
            return;
        }

        // exclude the closing braces, they are missing if the template is
        // not properly terminated
        int contentEnd = pText.startsWith("}}", end - 2) ? end - 2 : end;

        if (audio) {
            int begLabelIdx = pText.lastIndexOf('|', contentEnd - 1);
            if (begLabelIdx > start) {
                summary.append(pText.substring(begLabelIdx + 1, contentEnd)
                        .trim());
            }
            return;
        }

        int begLabelIdx = pText.indexOf(':', start);
        if (begLabelIdx > -1 && begLabelIdx < contentEnd) {
            int endLabelIdx = contentEnd;
            int separator = pText.indexOf('|', begLabelIdx);
            if (separator > -1 && separator < contentEnd) {
                endLabelIdx = separator;
            }
            summary.append(pText.substring(begLabelIdx + 1, endLabelIdx)
                    .trim());
        }
    }

    /**
     * Cleans up the text left over after the markup has been stripped:
     * leading indent, empty parenthesis, white space, bold and italic markers
     * and non breaking spaces.
     */
    private static String tidy(String pText) {
        String textString = removeIndentAtBeginning(pText);
        // remove empty parenthesis
        textString = EMPTY_PARENTHESIS.matcher(textString).replaceAll("");
        // remove commas and blanks next to a parenthesis
        textString = AFTER_OPENING_PARENTHESIS.matcher(textString)
                .replaceAll("(");
        textString = BEFORE_CLOSING_PARENTHESIS.matcher(textString)
                .replaceAll(")");

        textString = removeWhiteSpace(
                textString.replace('\r', ' ').replace('\n', ' ')
                        .replace('\t', ' ')).trim();
        textString = removeBold(textString);
        textString = removeItalic(textString);

        // convert 'non breaking html spaces' into blanks. This is done after
        // the white space has been collapsed, so these blanks are preserved.
        textString = textString.replace(HTML_NBSP, " ").replace(NBSP, ' ');
        textString = textString.replace("( ", "(").replace(" )", ")");
        return textString;
    }

    /**
     * Shortens the cleaned text to the requested length, preferably at a
     * full stop, and appends the ellipsis marker if something was cut off.
     */
    private static String truncate(String pText, int length,
            boolean stopAtParagraph) {
        final int textLength = pText.length();
        int endOfTextIdx = textLength;

        if (stopAtParagraph) {
            // only look at first paragraph for summary
            int paragraph = pText.indexOf("==");
            if (paragraph > 10) {
                endOfTextIdx = paragraph;
            }
        }

        // first paragraph too short for a summary or longer than allowed:
        // fall back to the length limit
        if (endOfTextIdx < 20 || endOfTextIdx > length) {
            if (length == 0 || textLength <= length) {
                // no limit, or the whole text fits: nothing to cut
                endOfTextIdx = textLength;
            } else {
                // find full stop near length of text, fall back to the last
                // blank if the full stop would give away too much text
                endOfTextIdx = pText.lastIndexOf('.', length);
                if (endOfTextIdx < 0.7 * length) {
                    endOfTextIdx = pText.lastIndexOf(' ', length);
                }
            }
        }

        String textString = pText;
        // add ellipsis if we have shortened the article
        if (endOfTextIdx > -1 && endOfTextIdx < textLength) {
            textString = pText.substring(0, endOfTextIdx) + ELLIPSIS;
        }

        // trim trailing spaces and return
        return textString.trim();
    }

    /**
     * Skips templates in wikipedia markup. Templates are enclosed within
     * braces {}. There might be nested templates within an other template.
     *
     * @param pText
     *            the wikipedia text with templates
     * @param pIdx
     *            pos in text to start with, MUST be a {
     * @return the idx into the text just behind the template, or the length
     *         of the text if the template does not properly end. pIdx is
     *         returned unchanged if there is no opening brace at pIdx.
     */
    static int skipTemplate(String pText, int pIdx) {
        // make sure we start with opening braces
        if (pText.charAt(pIdx) != '{') {
            return pIdx;
        }

        // counter for the braces we have opened, braces might be nested
        int numOpenings = 1;
        int idx = pIdx + 1;
        // loop till the end of the template or the end of the text
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '{') {
                numOpenings++;
            } else if (c == '}') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips a html element. Angle brackets are counted, so the text up to the
     * bracket closing the one at pIdx is skipped.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be a &lt;
     * @return the idx into the text just behind the closing bracket, or the
     *         length of the text if the bracket is never closed. pIdx is
     *         returned unchanged if there is no opening bracket at pIdx.
     */
    static int skipHTMLElement(String pText, int pIdx) {
        if (pText.charAt(pIdx) != '<') {
            return pIdx;
        }

        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '<') {
                numOpenings++;
            } else if (c == '>') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips a html comment.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be the '&lt;' of a "&lt;!"
     * @return the idx into the text just behind the comment terminator, or
     *         the length of the text if the comment is not terminated. pIdx
     *         is returned unchanged if the text at pIdx does not start with
     *         "&lt;!".
     */
    static int skipHTMLComment(String pText, int pIdx) {
        if (!pText.startsWith("<!", pIdx)) {
            return pIdx;
        }
        int terminator = pText.indexOf("-->", pIdx);
        return terminator > -1 ? terminator + 3 : pText.length();
    }

    /**
     * Removes an indented first line (a line starting with a colon, typically
     * a disambiguation hint) from the beginning of the text.
     */
    private static String removeIndentAtBeginning(String pText) {
        String text = pText.trim();
        if (!text.startsWith(":")) {
            return text;
        }

        int lineFeed = text.indexOf('\n');
        if (lineFeed > -1) {
            return text.substring(lineFeed + 1);
        }

        // no linefeed: if the indented text is in italics, drop the italic
        // part
        if (text.startsWith(":''")) {
            int italic = text.indexOf("''", 3);
            if (italic > -1) {
                return text.substring(italic + 2);
            }
        }
        return text;
    }

    /**
     * Finds the end of a link which may contain other links, for instance in
     * the caption of an image link.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos of the opening "[[" of the link
     * @return the idx of the "]]" closing the link (the last "]]" found if
     *         the brackets are not balanced), or pIdx if there is no "]]" at
     *         all
     */
    private static int findEndOfLink(String pText, int pIdx) {
        int end = pText.indexOf("]]", pIdx);
        if (end == -1) {
            return pIdx;
        }

        int idx = pIdx;
        int openingIdx = pText.indexOf("[[", idx + 2);
        // each link opened before the current end needs one more "]]"
        while (openingIdx > -1 && openingIdx < end) {
            idx = end;
            end = pText.indexOf("]]", end + 2);
            openingIdx = pText.indexOf("[[", idx);
        }
        if (end != -1) {
            idx = end;
        }
        return idx;
    }

    /**
     * Removes sequences of blanks and keeps only one blank. Only the blank
     * character itself is considered, other white space is kept.
     *
     * @param pString
     *            the text, must not be null
     * @return the text with every run of blanks collapsed into one blank
     */
    public static String removeWhiteSpace(String pString) {
        StringBuilder buf = new StringBuilder(pString.length());
        boolean previousWasBlank = false;
        for (int i = 0; i < pString.length(); i++) {
            char c = pString.charAt(i);
            boolean blank = c == ' ';
            if (!blank || !previousWasBlank) {
                buf.append(c);
            }
            previousWasBlank = blank;
        }
        return buf.toString();
    }

    /**
     * Removes the wikipedia markers for bold text (three apostrophes).
     *
     * @param pString
     *            the text, must not be null
     * @return the text without bold markers
     */
    public static String removeBold(String pString) {
        return pString.replace("'''", "");
    }

    /**
     * Removes the wikipedia markers for italic text (two apostrophes).
     *
     * @param pString
     *            the text, must not be null
     * @return the text without italic markers
     */
    public static String removeItalic(String pString) {
        return pString.replace("''", "");
    }

}