comparison dwtx/jface/internal/text/html/HTML2TextReader.d @ 129:eb30df5ca28b

Added JFace Text sources
author Frank Benoit <benoit@tionex.de>
date Sat, 23 Aug 2008 19:10:48 +0200
parents
children c4fb132a086c
comparison
equal deleted inserted replaced
128:8df1d4193877 129:eb30df5ca28b
1 /*******************************************************************************
2 * Copyright (c) 2000, 2008 IBM Corporation and others.
3 * All rights reserved. This program and the accompanying materials
4 * are made available under the terms of the Eclipse Public License v1.0
5 * which accompanies this distribution, and is available at
6 * http://www.eclipse.org/legal/epl-v10.html
7 *
8 * Contributors:
9 * IBM Corporation - initial API and implementation
10 * Port to the D programming language:
11 * Frank Benoit <benoit@tionex.de>
12 *******************************************************************************/
13 module dwtx.jface.internal.text.html.HTML2TextReader;
14
15 import dwt.dwthelper.utils;
16
17 import java.io.IOException;
18 import java.io.PushbackReader;
19 import java.io.Reader;
20 import java.util.HashMap;
21 import java.util.HashSet;
22 import java.util.Map;
23 import java.util.Set;
24
25 import dwt.DWT;
26 import dwt.custom.StyleRange;
27 import dwtx.jface.text.TextPresentation;
28
29
30 /**
31 * Reads the text contents from a reader of HTML contents and either
32 * translates the tags or cuts them out.
33 * <p>
34 * Moved into this package from <code>dwtx.jface.internal.text.revisions</code>.</p>
35 */
36 public class HTML2TextReader : SubstitutionTextReader {
37
/** Returned whenever a tag or character is to be removed from the output. */
private static final String EMPTY_STRING= ""; //$NON-NLS-1$
/** Maps entity names (without the leading '&' and trailing ';') to their replacement text. */
private static final Map fgEntityLookup;
/** Lower-case tag names that html2Text reacts to; any other tag is cut out. */
private static final Set fgTags;

static {

    fgTags= new HashSet();
    fgTags.add("b"); //$NON-NLS-1$
    fgTags.add("br"); //$NON-NLS-1$
    fgTags.add("br/"); //$NON-NLS-1$
    fgTags.add("div"); //$NON-NLS-1$
    fgTags.add("h1"); //$NON-NLS-1$
    fgTags.add("h2"); //$NON-NLS-1$
    fgTags.add("h3"); //$NON-NLS-1$
    fgTags.add("h4"); //$NON-NLS-1$
    fgTags.add("h5"); //$NON-NLS-1$
    // HTML defines six heading levels; without this entry <h6> was dropped
    // although html2Text treats every "h<digit>" tag as a heading.
    fgTags.add("h6"); //$NON-NLS-1$
    fgTags.add("p"); //$NON-NLS-1$
    fgTags.add("dl"); //$NON-NLS-1$
    fgTags.add("dt"); //$NON-NLS-1$
    fgTags.add("dd"); //$NON-NLS-1$
    fgTags.add("li"); //$NON-NLS-1$
    fgTags.add("ul"); //$NON-NLS-1$
    fgTags.add("pre"); //$NON-NLS-1$
    fgTags.add("head"); //$NON-NLS-1$

    fgEntityLookup= new HashMap(7);
    fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
    fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
}

/** Incremented for every character returned by read() (and for line breaks in preformatted text). */
private int fCounter= 0;
/** Receives bold style ranges; may be null, in which case no styles are recorded. */
private TextPresentation fTextPresentation;
/** Nesting depth of currently open bold sections. */
private int fBold= 0;
/** Value of fCounter when the outermost bold section started, or -1 if none is open. */
private int fStartOffset= -1;
/** Set by an opening p tag and cleared by the closing p tag. */
private bool fInParagraph= false;
/** True between an opening and a closing pre tag. */
private bool fIsPreformattedText= false;
/** True while inside the first head element; ordinary characters are then replaced by the empty string. */
private bool fIgnore= false;
/** Set once the first opening head tag has been seen. */
private bool fHeaderDetected= false;
81
/**
 * Creates a reader that converts the HTML delivered by <code>reader</code>
 * into formatted text. The given reader is wrapped in a
 * <code>PushbackReader</code> so that characters can be pushed back while
 * tags are scanned.
 *
 * @param reader the reader supplying the HTML input
 * @param presentation the presentation that receives bold style ranges, or
 *          <code>null</code> if no formatting should be recorded
 */
public HTML2TextReader(Reader reader, TextPresentation presentation) {
    super(new PushbackReader(reader));
    this.fTextPresentation= presentation;
}
93
/**
 * Reads one character through the superclass and counts it, so that
 * <code>fCounter</code> can be used as an offset into the produced text.
 *
 * @return the character read, or -1 as delivered by the superclass
 * @throws IOException if the superclass read fails
 */
public int read() throws IOException {
    int next= super.read();
    if (next is -1)
        return next; // nothing was produced, so the counter stays untouched
    ++fCounter;
    return next;
}
100
/**
 * Opens a bold section. Only the outermost opening records the start
 * offset; nested openings merely increase the nesting depth.
 */
protected void startBold() {
    bool outermost= (fBold is 0);
    ++fBold;
    if (outermost)
        fStartOffset= fCounter;
}
106
/**
 * Enters preformatted mode: whitespace skipping is switched off and the
 * preformatted flag is raised.
 */
protected void startPreformattedText() {
    setSkipWhitespace(false);
    fIsPreformattedText= true;
}
111
/**
 * Leaves preformatted mode: whitespace skipping is switched back on and
 * the preformatted flag is cleared.
 */
protected void stopPreformattedText() {
    setSkipWhitespace(true);
    fIsPreformattedText= false;
}
116
/**
 * Closes a bold section. When the outermost section is closed, a bold
 * style range covering the text produced since the matching
 * <code>startBold()</code> is added to the presentation (if there is one)
 * and the start offset is reset to -1.
 * <p>
 * A call without a matching <code>startBold()</code> is ignored. Without
 * this guard an unbalanced closing tag would drive the nesting depth
 * negative, after which no later bold section could ever be recorded.</p>
 */
protected void stopBold() {
    if (fBold <= 0)
        return; // unmatched closing tag: nothing is open

    --fBold;
    if (fBold > 0)
        return; // still inside an enclosing bold section

    if (fTextPresentation !is null) {
        fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, DWT.BOLD));
    }
    fStartOffset= -1;
}
126
/**
 * Decides how the character <code>c</code> is replaced. The checks are
 * made in this order:
 * <ul>
 * <li>'&lt;' starts a tag and is handled by processHTMLTag(), even while
 * ignoring;</li>
 * <li>while ignoring (inside the head element) every other character is
 * replaced by the empty string;</li>
 * <li>'&amp;' starts an entity and is handled by processEntity();</li>
 * <li>in preformatted mode the character goes to
 * processPreformattedText(int).</li>
 * </ul>
 *
 * @param c the character just read
 * @return the replacement text, or <code>null</code> when none of the
 *          cases applies (presumably meaning "keep the character" to the
 *          superclass -- confirm against SubstitutionTextReader)
 * @throws IOException if reading further input fails
 * @see SubstitutionTextReader#computeSubstitution(int)
 */
protected String computeSubstitution(int c) throws IOException {

    if (c is '<')
        return processHTMLTag();

    if (fIgnore)
        return EMPTY_STRING;

    if (c is '&')
        return processEntity();

    return fIsPreformattedText ? processPreformattedText(c) : null;
}
143
/**
 * Translates the content of one tag (the text between '&lt;' and '&gt;')
 * into its plain-text replacement and updates the reader state (bold
 * nesting, paragraph, preformatted and head tracking).
 * <p>
 * Only tags whose complete, lower-cased content equals an entry of
 * <code>fgTags</code>, optionally preceded by '/', are recognized; all
 * other tags yield the empty string.</p>
 *
 * @param html the tag content, may be <code>null</code> or empty
 * @return the text that replaces the tag, never <code>null</code>
 */
private String html2Text(String html) {

    if (html is null || html.length() is 0)
        return EMPTY_STRING;

    html= html.toLowerCase();

    bool closing= ('/' is html.charAt(0));
    String name= closing ? html.substring(1) : html;

    if (!fgTags.contains(name))
        return EMPTY_STRING;

    // pre tags are honoured even while in preformatted mode ...
    if ("pre".equals(name)) { //$NON-NLS-1$
        if (closing)
            stopPreformattedText();
        else
            startPreformattedText();
        return EMPTY_STRING;
    }

    // ... every other tag inside preformatted text is simply dropped
    if (fIsPreformattedText)
        return EMPTY_STRING;

    // headings (h followed by a digit) and dt are rendered bold
    bool boldBlock= "dt".equals(name) //$NON-NLS-1$
            || (name.length() > 1 && name.charAt(0) is 'h' && Character.isDigit(name.charAt(1)));

    if (closing) {
        if ("b".equals(name)) { //$NON-NLS-1$
            stopBold();
            return EMPTY_STRING;
        }

        if (boldBlock) {
            stopBold();
            return LINE_DELIM;
        }

        if ("p".equals(name)) { //$NON-NLS-1$
            // a closing p that matches an opening p adds nothing;
            // a stray closing p produces a line break
            bool inParagraph= fInParagraph;
            fInParagraph= false;
            return inParagraph ? EMPTY_STRING : LINE_DELIM;
        }

        if ("dd".equals(name)) //$NON-NLS-1$
            return LINE_DELIM;

        // end of the head element: stop ignoring characters
        if ("head".equals(name) && fHeaderDetected && fIgnore) //$NON-NLS-1$
            fIgnore= false;

        return EMPTY_STRING;
    }

    if ("b".equals(name) || boldBlock) { //$NON-NLS-1$
        startBold();
        return EMPTY_STRING;
    }

    if ("p".equals(name)) { //$NON-NLS-1$
        fInParagraph= true;
        return LINE_DELIM;
    }

    if ("dl".equals(name) || "br".equals(name) || "br/".equals(name) || "div".equals(name)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
        return LINE_DELIM;

    if ("dd".equals(name)) //$NON-NLS-1$
        return "\t"; //$NON-NLS-1$

    if ("li".equals(name)) //$NON-NLS-1$
        // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
        return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$

    // only the first head element switches the reader into ignore mode
    if ("head".equals(name) && !fHeaderDetected) { //$NON-NLS-1$
        fHeaderDetected= true;
        fIgnore= true;
    }

    return EMPTY_STRING;
}
232
/**
 * A '&lt;' has been read: scans up to the closing '&gt;' and translates
 * the tag with html2Text(String).
 * <ul>
 * <li>Text in double quotes is copied without looking for '&lt;' or
 * '&gt;' inside it.</li>
 * <li>If another '&lt;' shows up before the tag is closed (and we are not
 * in a comment), the first '&lt;' was ordinary text: the second one is
 * pushed back and the scanned text is returned unchanged.</li>
 * <li>Inside a comment ("!--") a '&gt;' only ends the tag when it is
 * preceded by "--"; otherwise it is part of the comment.</li>
 * </ul>
 * The scanned text is buffered in its original case and only lower-cased
 * for tag recognition. It was previously lower-cased while scanning, which
 * also altered literal text returned to the output.
 *
 * @return the replacement text, or <code>null</code> if the input ended
 *          before the tag was closed
 * @throws IOException if reading or pushing back fails
 */
private String processHTMLTag() throws IOException {

    StringBuffer buf= new StringBuffer();
    int ch;
    do {

        ch= nextChar();

        while (ch !is -1 && ch !is '>') {
            buf.append((char) ch);
            ch= nextChar();
            if (ch is '"') {
                // copy a quoted attribute value verbatim
                buf.append((char) ch);
                ch= nextChar();
                while (ch !is -1 && ch !is '"') {
                    buf.append((char) ch);
                    ch= nextChar();
                }
            }
            if (ch is '<' && !isInComment(buf)) {
                // the earlier '<' was plain text: emit it as written
                unread(ch);
                return '<' + buf.toString();
            }
        }

        if (ch is -1)
            return null;

        if (!isInComment(buf) || isCommentEnd(buf)) {
            break;
        }
        // unfinished comment
        buf.append((char) ch);
    } while (true);

    // tag names are matched case-insensitively, so hand over a lower-case copy
    String raw= buf.toString();
    StringBuffer lower= new StringBuffer();
    for (int i= 0; i < raw.length(); i++)
        lower.append(Character.toLowerCase(raw.charAt(i)));

    return html2Text(lower.toString());
}
273
/**
 * Tells whether the tag content collected so far starts an HTML comment,
 * i.e. begins with "!--".
 *
 * @param buf the tag content without the leading '&lt;'
 * @return <code>true</code> if the content starts with "!--"
 */
private static bool isInComment(StringBuffer buf) {
    if (buf.length() < 3)
        return false;
    return "!--".equals(buf.substring(0, 3)); //$NON-NLS-1$
}
277
/**
 * Tells whether the tag content collected so far ends with "--". At least
 * five characters are required so that the dashes of the opening "!--"
 * are not mistaken for the closing ones.
 *
 * @param buf the tag content without the leading '&lt;'
 * @return <code>true</code> if the content is at least five characters
 *          long and ends with "--"
 */
private static bool isCommentEnd(StringBuffer buf) {
    int length= buf.length();
    if (length < 5)
        return false;
    return "--".equals(buf.substring(length - 2)); //$NON-NLS-1$
}
282
/**
 * Handles a character read in preformatted mode: a carriage return or
 * line feed increments <code>fCounter</code>; no substitution is made.
 * NOTE(review): read() also increments <code>fCounter</code> for every
 * character it returns -- confirm that the extra increment is intended.
 *
 * @param c the character just read
 * @return always <code>null</code>
 */
private String processPreformattedText(int c) {
    bool lineBreak= (c is '\n' || c is '\r');
    if (lineBreak)
        ++fCounter;
    return null;
}
288
289
/**
 * Pushes a character back onto the underlying reader, which is the
 * <code>PushbackReader</code> created in the constructor.
 *
 * @param ch the character to push back
 * @throws IOException if the reader rejects the push-back
 */
private void unread(int ch) throws IOException {
    PushbackReader pushback= (PushbackReader) getReader();
    pushback.unread(ch);
}
293
/**
 * Translates an entity name into the text it stands for.
 * <ul>
 * <li>Numeric references: "#" followed by a decimal number, or "#x" / "#X"
 * followed by a hexadecimal number. Code points above U+FFFF are returned
 * as a surrogate pair instead of being truncated to one char; values
 * outside 0..0x10FFFF are treated as unknown.</li>
 * <li>Named entities are looked up in <code>fgEntityLookup</code>.</li>
 * </ul>
 *
 * @param symbol the entity without the leading '&amp;' and trailing ';'
 * @return the replacement text, or "&amp;" followed by
 *          <code>symbol</code> if the entity is unknown or malformed
 */
protected String entity2Text(String symbol) {
    if (symbol.length() > 1 && symbol.charAt(0) is '#') {
        try {
            int codePoint;
            char marker= symbol.charAt(1);
            if (marker is 'x' || marker is 'X') {
                codePoint= Integer.parseInt(symbol.substring(2), 16);
            } else {
                codePoint= Integer.parseInt(symbol.substring(1), 10);
            }

            if (codePoint >= 0 && codePoint < 0x10000)
                return EMPTY_STRING + (char) codePoint;

            if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
                // encode a supplementary code point as a UTF-16 surrogate pair
                int offset= codePoint - 0x10000;
                char high= (char) (0xD800 + (offset >> 10));
                char low= (char) (0xDC00 + (offset & 0x3FF));
                return EMPTY_STRING + high + low;
            }
            // out of range: fall through and return the reference unresolved
        } catch (NumberFormatException e) {
            // not a number: deliberately fall through and return the reference unresolved
        }
    } else {
        String str= (String) fgEntityLookup.get(symbol);
        if (str !is null) {
            return str;
        }
    }
    return "&" + symbol; // not found //$NON-NLS-1$
}
314
/**
 * A '&amp;' has been read: collects the following letters, digits and '#'
 * characters as the entity name. If the name is terminated by ';' it is
 * translated by entity2Text(String). Otherwise the '&amp;', the collected
 * name and the terminating character (unless the input ended) are
 * returned as they were read.
 *
 * @return the text that replaces the entity
 * @throws IOException if reading further input fails
 */
private String processEntity() throws IOException {
    StringBuffer name= new StringBuffer();

    int ch;
    for (ch= nextChar(); ch is '#' || Character.isLetterOrDigit((char) ch); ch= nextChar())
        name.append((char) ch);

    if (ch is ';')
        return entity2Text(name.toString());

    // not a complete entity: give the characters back as literal text
    name.insert(0, '&');
    if (ch is -1)
        return name.toString();

    name.append((char) ch);
    return name.toString();
}
334 }