diff dwtx/jface/internal/text/html/HTML2TextReader.d @ 129:eb30df5ca28b

Added JFace Text sources
author Frank Benoit <benoit@tionex.de>
date Sat, 23 Aug 2008 19:10:48 +0200
parents
children c4fb132a086c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dwtx/jface/internal/text/html/HTML2TextReader.d	Sat Aug 23 19:10:48 2008 +0200
@@ -0,0 +1,334 @@
+/*******************************************************************************
+ * Copyright (c) 2000, 2008 IBM Corporation and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/legal/epl-v10.html
+ *
+ * Contributors:
+ *     IBM Corporation - initial API and implementation
+ * Port to the D programming language:
+ *     Frank Benoit <benoit@tionex.de>
+ *******************************************************************************/
+module dwtx.jface.internal.text.html.HTML2TextReader;
+
+import dwt.dwthelper.utils;
+
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import dwt.DWT;
+import dwt.custom.StyleRange;
+import dwtx.jface.text.TextPresentation;
+
+
+/**
+ * Reads the text contents from a reader of HTML contents and translates
+ * the tags or cut them out.
+ * <p>
+ * Moved into this package from <code>dwtx.jface.internal.text.revisions</code>.</p>
+ */
+public class HTML2TextReader : SubstitutionTextReader {
+
+    private static final String EMPTY_STRING= ""; //$NON-NLS-1$
+    private static final Map fgEntityLookup;
+    private static final Set fgTags;
+
+    static {
+
+        fgTags= new HashSet();
+        fgTags.add("b"); //$NON-NLS-1$
+        fgTags.add("br"); //$NON-NLS-1$
+        fgTags.add("br/"); //$NON-NLS-1$
+        fgTags.add("div"); //$NON-NLS-1$
+        fgTags.add("h1"); //$NON-NLS-1$
+        fgTags.add("h2"); //$NON-NLS-1$
+        fgTags.add("h3"); //$NON-NLS-1$
+        fgTags.add("h4"); //$NON-NLS-1$
+        fgTags.add("h5"); //$NON-NLS-1$
+        fgTags.add("p"); //$NON-NLS-1$
+        fgTags.add("dl"); //$NON-NLS-1$
+        fgTags.add("dt"); //$NON-NLS-1$
+        fgTags.add("dd"); //$NON-NLS-1$
+        fgTags.add("li"); //$NON-NLS-1$
+        fgTags.add("ul"); //$NON-NLS-1$
+        fgTags.add("pre"); //$NON-NLS-1$
+        fgTags.add("head"); //$NON-NLS-1$
+
+        fgEntityLookup= new HashMap(7);
+        fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
+        fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
+        fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
+        fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
+        fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
+        fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
+        fgEntityLookup.put("quot", "\"");        //$NON-NLS-1$ //$NON-NLS-2$
+    }
+
+    private int fCounter= 0;
+    private TextPresentation fTextPresentation;
+    private int fBold= 0;
+    private int fStartOffset= -1;
+    private bool fInParagraph= false;
+    private bool fIsPreformattedText= false;
+    private bool fIgnore= false;
+    private bool fHeaderDetected= false;
+
+    /**
+     * Transforms the HTML text from the reader to formatted text.
+     *
+     * @param reader the reader
+     * @param presentation If not <code>null</code>, formattings will be applied to
+     * the presentation.
+    */
+    public HTML2TextReader(Reader reader, TextPresentation presentation) {
+        super(new PushbackReader(reader));
+        fTextPresentation= presentation;
+    }
+
+    public int read() throws IOException {
+        int c= super.read();
+        if (c !is -1)
+            ++ fCounter;
+        return c;
+    }
+
+    protected void startBold() {
+        if (fBold is 0)
+            fStartOffset= fCounter;
+        ++ fBold;
+    }
+
+    protected void startPreformattedText() {
+        fIsPreformattedText= true;
+        setSkipWhitespace(false);
+    }
+
+    protected void stopPreformattedText() {
+        fIsPreformattedText= false;
+        setSkipWhitespace(true);
+    }
+
+    protected void stopBold() {
+        -- fBold;
+        if (fBold is 0) {
+            if (fTextPresentation !is null) {
+                fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, DWT.BOLD));
+            }
+            fStartOffset= -1;
+        }
+    }
+
+    /*
+     * @see dwtx.jdt.internal.ui.text.SubstitutionTextReader#computeSubstitution(int)
+     */
+    protected String computeSubstitution(int c) throws IOException {
+
+        if (c is '<')
+            return  processHTMLTag();
+        else if (fIgnore)
+            return EMPTY_STRING;
+        else if (c is '&')
+            return processEntity();
+        else if (fIsPreformattedText)
+            return processPreformattedText(c);
+
+        return null;
+    }
+
+    private String html2Text(String html) {
+
+        if (html is null || html.length() is 0)
+            return EMPTY_STRING;
+
+        html= html.toLowerCase();
+        
+        String tag= html;
+        if ('/' is tag.charAt(0))
+            tag= tag.substring(1);
+
+        if (!fgTags.contains(tag))
+            return EMPTY_STRING;
+
+
+        if ("pre".equals(html)) { //$NON-NLS-1$
+            startPreformattedText();
+            return EMPTY_STRING;
+        }
+
+        if ("/pre".equals(html)) { //$NON-NLS-1$
+            stopPreformattedText();
+            return EMPTY_STRING;
+        }
+
+        if (fIsPreformattedText)
+            return EMPTY_STRING;
+
+        if ("b".equals(html)) { //$NON-NLS-1$
+            startBold();
+            return EMPTY_STRING;
+        }
+
+        if ((html.length() > 1 && html.charAt(0) is 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
+            startBold();
+            return EMPTY_STRING;
+        }
+
+        if ("dl".equals(html)) //$NON-NLS-1$
+            return LINE_DELIM;
+
+        if ("dd".equals(html)) //$NON-NLS-1$
+            return "\t"; //$NON-NLS-1$
+
+        if ("li".equals(html)) //$NON-NLS-1$
+            // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
+            return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$
+
+        if ("/b".equals(html)) { //$NON-NLS-1$
+            stopBold();
+            return EMPTY_STRING;
+        }
+
+        if ("p".equals(html))  { //$NON-NLS-1$
+            fInParagraph= true;
+            return LINE_DELIM;
+        }
+
+        if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+            return LINE_DELIM;
+
+        if ("/p".equals(html))  { //$NON-NLS-1$
+            bool inParagraph= fInParagraph;
+            fInParagraph= false;
+            return inParagraph ? EMPTY_STRING : LINE_DELIM;
+        }
+
+        if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
+            stopBold();
+            return LINE_DELIM;
+        }
+
+        if ("/dd".equals(html)) //$NON-NLS-1$
+            return LINE_DELIM;
+        
+        if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
+            fHeaderDetected= true;
+            fIgnore= true;
+            return EMPTY_STRING;
+        }
+        
+        if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
+            fIgnore= false;
+            return EMPTY_STRING;
+        }
+
+        return EMPTY_STRING;
+    }
+
+    /*
+     * A '<' has been read. Process a html tag
+     */
+    private String processHTMLTag() throws IOException {
+
+        StringBuffer buf= new StringBuffer();
+        int ch;
+        do {
+
+            ch= nextChar();
+
+            while (ch !is -1 && ch !is '>') {
+                buf.append(Character.toLowerCase((char) ch));
+                ch= nextChar();
+                if (ch is '"'){
+                    buf.append(Character.toLowerCase((char) ch));
+                    ch= nextChar();
+                    while (ch !is -1 && ch !is '"'){
+                        buf.append(Character.toLowerCase((char) ch));
+                        ch= nextChar();
+                    }
+                }
+                if (ch is '<' && !isInComment(buf)) {
+                    unread(ch);
+                    return '<' + buf.toString();
+                }
+            }
+
+            if (ch is -1)
+                return null;
+
+            if (!isInComment(buf) || isCommentEnd(buf)) {
+                break;
+            }
+            // unfinished comment
+            buf.append((char) ch);
+        } while (true);
+
+        return html2Text(buf.toString());
+    }
+    
+    private static bool isInComment(StringBuffer buf) {
+        return buf.length() >= 3 && "!--".equals(buf.substring(0, 3)); //$NON-NLS-1$
+    }
+    
+    private static bool isCommentEnd(StringBuffer buf) {
+        int tagLen= buf.length();
+        return tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)); //$NON-NLS-1$
+    }
+
+    private String processPreformattedText(int c) {
+        if  (c is '\r' || c is '\n')
+            fCounter++;
+        return null;
+    }
+
+
+    private void unread(int ch) throws IOException {
+        ((PushbackReader) getReader()).unread(ch);
+    }
+
+    protected String entity2Text(String symbol) {
+        if (symbol.length() > 1 && symbol.charAt(0) is '#') {
+            int ch;
+            try {
+                if (symbol.charAt(1) is 'x') {
+                    ch= Integer.parseInt(symbol.substring(2), 16);
+                } else {
+                    ch= Integer.parseInt(symbol.substring(1), 10);
+                }
+                return EMPTY_STRING + (char)ch;
+            } catch (NumberFormatException e) {
+            }
+        } else {
+            String str= (String) fgEntityLookup.get(symbol);
+            if (str !is null) {
+                return str;
+            }
+        }
+        return "&" + symbol; // not found //$NON-NLS-1$
+    }
+
+    /*
+     * A '&' has been read. Process a entity
+     */
+    private String processEntity() throws IOException {
+        StringBuffer buf= new StringBuffer();
+        int ch= nextChar();
+        while (Character.isLetterOrDigit((char)ch) || ch is '#') {
+            buf.append((char) ch);
+            ch= nextChar();
+        }
+
+        if (ch is ';')
+            return entity2Text(buf.toString());
+
+        buf.insert(0, '&');
+        if (ch !is -1)
+            buf.append((char) ch);
+        return buf.toString();
+    }
+}