Source code

001/*
002 * Copyright 2007 Marc Wick, geonames.org
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 */
017package org.geonames.wikipedia;
018
/**
 * Extracts short plain-text summaries from the wiki markup of a wikipedia
 * article. Templates, html elements, html comments, links and section headers
 * are removed or reduced to their visible label; the remaining text is
 * whitespace-normalised and shortened to the requested length.
 * 
 * All methods are static and keep no state.
 * 
 * @author marc
 */
public class TextSummaryExtractor {

    /** Prefix of the audio template; its last parameter is kept as label. */
    private static final String AUDIO_TEMPLATE = "{{audio";

    /** Prefix of the formatnum template; the number after the colon is kept. */
    private static final String FORMATNUM_TEMPLATE = "{{formatnum";

    /** Parentheses that contain no word character at all. */
    private static final java.util.regex.Pattern EMPTY_PARENTHESES = java.util.regex.Pattern
            .compile("\\([^\\w]*\\)");

    /** Commas and blanks directly after an opening parenthesis. */
    private static final java.util.regex.Pattern AFTER_OPENING_PARENTHESIS = java.util.regex.Pattern
            .compile("\\([, ]*");

    /** Commas and blanks directly in front of a closing parenthesis. */
    private static final java.util.regex.Pattern BEFORE_CLOSING_PARENTHESIS = java.util.regex.Pattern
            .compile("[, ]*\\)");

    /**
     * Extracts a text summary from a wikipedia article and stops at the first
     * paragraph. Equivalent to {@code extractSummary(pText, length, true)}.
     * 
     * @param pText
     *            the wikipedia markup, may be {@code null}
     * @param length
     *            the position up to which the summary text is taken;
     *            {@code 0} means no limit
     * @return the summary, or {@code null} if {@code pText} is {@code null}
     */
    public static String extractSummary(String pText, int length) {
        return extractSummary(pText, length, true);
    }

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup is
     * removed. If the text has to be shortened, the extractor tries to end the
     * summary at a full stop (falling back to a blank) at or before position
     * {@code length} and appends {@code " (...)"}.
     * 
     * @param pText
     *            the wikipedia markup, may be {@code null}
     * @param length
     *            the position up to which the summary text is taken;
     *            {@code 0} means no limit
     * @param stopAtParagraph
     *            if {@code true} the summary ends at the first remaining
     *            {@code ==} section marker (when it is found after the first
     *            ten characters)
     * @return the summary, or {@code null} if {@code pText} is {@code null}
     */
    public static String extractSummary(String pText, int length,
            boolean stopAtParagraph) {
        if (pText == null) {
            return null;
        }
        String plainText = cleanUp(stripMarkup(pText, length));
        return shorten(plainText, length, stopAtParagraph);
    }

    /**
     * Removes the wikipedia markup (paragraphs are kept). Only as much of the
     * input is processed as is needed for a summary of the requested length.
     */
    private static String stripMarkup(String pText, int length) {
        StringBuilder summary = new StringBuilder();
        int textLength = pText.length();
        // computed as long: 50 + 2 * length overflows int for large lengths
        long maxRawLength = 50L + 2L * length;
        int idx = 0;

        while (idx < textLength
                && (length == 0 || summary.length() < maxRawLength)) {
            char c = pText.charAt(idx);
            // one character look ahead; '\0' stands for "end of text"
            char next = idx + 1 < textLength ? pText.charAt(idx + 1) : '\0';

            if (c == '{') {
                // skip the template, but keep the label of some templates
                int endidx = skipTemplate(pText, idx);
                appendTemplateLabel(pText, idx, endidx, summary);
                idx = endidx;
                continue;
            } else if (c == '<') {
                // html comment or html element, both are skipped
                idx = next == '!' ? skipHTMLComment(pText, idx)
                        : skipHTMLElement(pText, idx);
                continue;
            } else if (c == '[') {
                if (next == '[') {
                    // two square brackets "[[" (link)
                    int endOfLink = pText.indexOf("]]", idx);

                    // image/category/interwiki link? These may contain a
                    // caption with links of its own and are dropped entirely.
                    int colon = pText.indexOf(':', idx);
                    if (colon > -1 && colon < endOfLink) {
                        idx = findEndOfLink(pText, idx);
                        continue;
                    }

                    // keep the anchor text if there is one, else the target
                    int beginAnchor = pText.indexOf('|', idx);
                    if (beginAnchor > -1 && beginAnchor < endOfLink) {
                        idx = beginAnchor + 1;
                    } else {
                        idx = idx + 2;
                    }
                    continue;
                }
                // single square bracket: reference link, removed completely
                int endOfReference = pText.indexOf(']', idx);
                if (endOfReference > -1) {
                    idx = endOfReference + 1;
                    continue;
                }
                // no closing bracket: the '[' is ordinary text
            } else if (c == ']') {
                if (next == ']') {
                    idx = idx + 2;
                    continue;
                }
            } else if (c == '=') {
                if (next == '=') {
                    int endHeaderIdx = pText.indexOf("==", idx + 2);
                    if (endHeaderIdx > -1) {
                        // skip the complete section header
                        idx = endHeaderIdx + 2;
                        continue;
                    }
                    // Unclosed header: treat the '=' as ordinary text so that
                    // the loop always advances.
                }
            }

            summary.append(c);
            idx++;
        }
        return summary.toString();
    }

    /**
     * Appends the visible label of an audio or formatnum template to the
     * summary. Other templates contribute nothing.
     * 
     * @param pText
     *            the wikipedia text
     * @param start
     *            index of the opening brace of the template
     * @param end
     *            index just after the template as returned by
     *            {@link #skipTemplate(String, int)}
     * @param summary
     *            the buffer the label is appended to
     */
    private static void appendTemplateLabel(String pText, int start, int end,
            StringBuilder summary) {
        // the label ends in front of the closing "}}"
        int contentEnd = end - 2;

        if (pText.regionMatches(true, start, AUDIO_TEMPLATE, 0,
                AUDIO_TEMPLATE.length())) {
            // the label is the last parameter of the template
            int begLabelIdx = pText.lastIndexOf('|', end - 1);
            if (begLabelIdx > start && begLabelIdx + 1 <= contentEnd) {
                summary.append(pText.substring(begLabelIdx + 1, contentEnd)
                        .trim());
            }
        }
        if (pText.regionMatches(true, start, FORMATNUM_TEMPLATE, 0,
                FORMATNUM_TEMPLATE.length())) {
            // the number is between the colon and the first '|' (if any)
            int begLabelIdx = pText.indexOf(':', start);
            if (begLabelIdx > -1 && begLabelIdx < end) {
                int endLabelIdx = contentEnd;
                int pipe = pText.indexOf('|', begLabelIdx);
                // only a '|' inside this template terminates the number
                if (pipe > -1 && pipe < contentEnd) {
                    endLabelIdx = pipe;
                }
                if (begLabelIdx + 1 <= endLabelIdx) {
                    summary.append(pText.substring(begLabelIdx + 1,
                            endLabelIdx).trim());
                }
            }
        }
    }

    /**
     * Tidies up the text left after the markup has been removed: drops a
     * leading indented line, empty parentheses, superfluous whitespace and the
     * bold/italic quote markers.
     */
    private static String cleanUp(String pText) {
        String textString = removeIndentAtBeginning(pText);
        // remove empty parenthesis
        textString = EMPTY_PARENTHESES.matcher(textString).replaceAll("");
        // remove commas and blanks next to parenthesis
        textString = AFTER_OPENING_PARENTHESIS.matcher(textString).replaceAll(
                "(");
        textString = BEFORE_CLOSING_PARENTHESIS.matcher(textString)
                .replaceAll(")");

        textString = removeWhiteSpace(
                textString.replace('\r', ' ').replace('\n', ' ')
                        .replace('\t', ' ')).trim();
        textString = removeBold(textString);
        textString = removeItalic(textString);

        // convert 'non breaking html spaces' into blanks. But preserve them
        // (don't remove white space)
        textString = textString.replace("&nbsp;", " ");
        textString = textString.replace("( ", "(");
        textString = textString.replace(" )", ")");
        return textString;
    }

    /**
     * Shortens the cleaned text. The text is cut at the first paragraph marker
     * (if requested) or, when that is not possible or still too long, at the
     * last full stop at or before {@code length}; if that full stop lies in the
     * first 70% of the allowed length, the last blank is used instead. An
     * ellipsis is appended whenever text has been cut off.
     */
    private static String shorten(String pText, int length,
            boolean stopAtParagraph) {
        String textString = pText;
        if (length == 0) {
            // no length limit requested
            return textString.trim();
        }

        int endOfTextIdx = textString.length();
        if (stopAtParagraph) {
            // only look at first paragraph for summary
            int paragraph = textString.indexOf("==");
            if (paragraph > 10) {
                endOfTextIdx = paragraph;
            }
        }

        // A complete text that already fits must not be shortened, even if
        // it is very short.
        boolean nothingToCut = endOfTextIdx == textString.length()
                && endOfTextIdx <= length;
        if (!nothingToCut && (endOfTextIdx < 20 || endOfTextIdx > length)) {
            // find full stop near length of text
            endOfTextIdx = textString.lastIndexOf('.', length);
            if (endOfTextIdx < 0.7 * length) {
                endOfTextIdx = textString.lastIndexOf(' ', length);
            }
        }

        // add elipsis if we have shortened the article
        if (endOfTextIdx > -1 && endOfTextIdx < textString.length()) {
            textString = textString.substring(0, endOfTextIdx) + " (...)";
        }

        // trim trailing spaces and return
        return textString.trim();
    }

    /**
     * Skips templates in wikipedia markup. Templates are enclosed within braces
     * {}. There might be nested templates within an other template.
     * 
     * @param pText
     *            the wikipedia text with templates
     * @param pIdx
     *            pos in text to start with, MUST be a {
     * @return the idx into the text just after the end of the template, the
     *         length of the text if the template does not properly end, or
     *         {@code pIdx} if there is no opening brace at {@code pIdx}
     */
    static int skipTemplate(String pText, int pIdx) {
        // make sure we start with opening braces
        if (pText.charAt(pIdx) != '{') {
            return pIdx;
        }

        // counter for the braces we have opened, braces might be nested;
        // iterative implementation instead of recursion
        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '{') {
                numOpenings++;
            } else if (c == '}') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips a html element (a single tag, not the content between an opening
     * and a closing tag). Nested angle brackets are taken into account.
     * 
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be a '<'
     * @return the idx into the text just after the closing '>', the length of
     *         the text if the element does not properly end, or {@code pIdx}
     *         if there is no '<' at {@code pIdx}
     */
    static int skipHTMLElement(String pText, int pIdx) {
        if (pText.charAt(pIdx) != '<') {
            return pIdx;
        }

        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '<') {
                numOpenings++;
            } else if (c == '>') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips a html comment.
     * 
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be the '<' of a "<!"
     * @return the idx into the text just after the closing "-->", the length
     *         of the text if the comment is not closed, or {@code pIdx} if
     *         the text does not continue with "<!" at {@code pIdx}
     */
    static int skipHTMLComment(String pText, int pIdx) {
        if (!pText.startsWith("<!", pIdx)) {
            return pIdx;
        }
        int endOfComment = pText.indexOf("-->", pIdx);
        if (endOfComment == -1) {
            // unterminated comment swallows the rest of the text
            return pText.length();
        }
        return endOfComment + 3;
    }

    /**
     * Removes an indented first line (wiki markup ':' at the beginning, usually
     * a disambiguation hint). If the line feed is already gone, a leading
     * italic note of the form {@code :''...''} is removed instead.
     */
    private static String removeIndentAtBeginning(String pText) {
        String text = pText.trim();
        if (!text.startsWith(":")) {
            return text;
        }
        int lineFeed = text.indexOf('\n');
        if (lineFeed > -1) {
            return text.substring(lineFeed + 1);
        }
        // we may already have removed the linefeed, check for italics
        if (text.startsWith(":''")) {
            int italic = text.indexOf("''", 3);
            if (italic > -1) {
                return text.substring(italic + 2);
            }
        }
        return text;
    }

    /**
     * Finds the closing "]]" of a link that may contain further links (for
     * instance the caption of an image link).
     * 
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            index of the opening "[[" of the link
     * @return the index of the "]]" regarded as the end of the link, or
     *         {@code pIdx} if the text contains no "]]" after {@code pIdx}
     */
    private static int findEndOfLink(String pText, int pIdx) {
        int end = pText.indexOf("]]", pIdx);
        if (end == -1) {
            return pIdx;
        }

        int idx = pIdx;
        int openingIdx = pText.indexOf("[[", idx + 2);
        // every link opened before the current end moves the end to the
        // next "]]"
        while (openingIdx > -1 && openingIdx < end) {
            idx = end;
            end = pText.indexOf("]]", end + 2);
            openingIdx = pText.indexOf("[[", idx);
        }
        if (end != -1) {
            idx = end;
        }
        return idx;
    }

    /**
     * Removes sequences of blanks and keeps only one blank per sequence. Only
     * the space character is handled, other whitespace is left untouched.
     * 
     * @param pString
     *            the text to normalise, must not be {@code null}
     * @return the text with every run of blanks reduced to a single blank
     */
    public static String removeWhiteSpace(String pString) {
        StringBuilder buf = new StringBuilder(pString.length());
        boolean previousWasBlank = false;
        for (int i = 0; i < pString.length(); i++) {
            char c = pString.charAt(i);
            boolean blank = c == ' ';
            if (!blank || !previousWasBlank) {
                buf.append(c);
            }
            previousWasBlank = blank;
        }
        return buf.toString();
    }

    /**
     * Removes the wiki markup for bold text (three single quotes).
     * 
     * @param pString
     *            the text, must not be {@code null}
     * @return the text without bold markers
     */
    public static String removeBold(String pString) {
        return pString.replace("'''", "");
    }

    /**
     * Removes the wiki markup for italic text (two single quotes).
     * 
     * @param pString
     *            the text, must not be {@code null}
     * @return the text without italic markers
     */
    public static String removeItalic(String pString) {
        return pString.replace("''", "");
    }

}