Mercurial > projects > dwt-addons
annotate dwtx/jface/internal/text/html/HTML2TextReader.d @ 158:25f1f92fa3df
...
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Tue, 26 Aug 2008 02:46:34 +0200 |
parents | f70d9508c95c |
children | 1a5b8f8129df |
rev | line source |
---|---|
129 | 1 /******************************************************************************* |
2 * Copyright (c) 2000, 2008 IBM Corporation and others. | |
3 * All rights reserved. This program and the accompanying materials | |
4 * are made available under the terms of the Eclipse Public License v1.0 | |
5 * which accompanies this distribution, and is available at | |
6 * http://www.eclipse.org/legal/epl-v10.html | |
7 * | |
8 * Contributors: | |
9 * IBM Corporation - initial API and implementation | |
10 * Port to the D programming language: | |
11 * Frank Benoit <benoit@tionex.de> | |
12 *******************************************************************************/ | |
13 module dwtx.jface.internal.text.html.HTML2TextReader; | |
14 | |
131 | 15 import dwtx.jface.internal.text.html.HTMLPrinter; // packageimport |
16 import dwtx.jface.internal.text.html.BrowserInformationControl; // packageimport | |
17 import dwtx.jface.internal.text.html.SubstitutionTextReader; // packageimport | |
18 import dwtx.jface.internal.text.html.HTMLTextPresenter; // packageimport | |
19 import dwtx.jface.internal.text.html.BrowserInput; // packageimport | |
20 import dwtx.jface.internal.text.html.SingleCharReader; // packageimport | |
21 import dwtx.jface.internal.text.html.BrowserInformationControlInput; // packageimport | |
22 import dwtx.jface.internal.text.html.HTMLMessages; // packageimport | |
23 | |
129 | 24 import dwt.dwthelper.utils; |
158 | 25 import dwtx.dwtxhelper.PushbackReader; |
153
f70d9508c95c
Fix java Collection imports
Frank Benoit <benoit@tionex.de>
parents:
150
diff
changeset
|
26 import dwtx.dwtxhelper.Collection; |
f70d9508c95c
Fix java Collection imports
Frank Benoit <benoit@tionex.de>
parents:
150
diff
changeset
|
27 |
129 | 28 import dwt.DWT; |
29 import dwt.custom.StyleRange; | |
30 import dwtx.jface.text.TextPresentation; | |
31 | |
32 | |
/**
 * Reads the text contents from a reader of HTML contents and translates
 * the tags or cuts them out.
 * <p>
 * Moved into this package from <code>dwtx.jface.internal.text.revisions</code>.</p>
 */
public class HTML2TextReader : SubstitutionTextReader {

    private static const String EMPTY_STRING= ""; //$NON-NLS-1$
    /** Maps an entity name (without the leading '&' and trailing ';') to its replacement text. */
    private static const Map fgEntityLookup;
    /** Lower-case names of the tags <code>html2Text</code> acts on; any other tag is cut out. */
    private static const Set fgTags;

    static this() {

        fgTags= new HashSet();
        fgTags.add("b"); //$NON-NLS-1$
        fgTags.add("br"); //$NON-NLS-1$
        fgTags.add("br/"); //$NON-NLS-1$
        fgTags.add("div"); //$NON-NLS-1$
        fgTags.add("h1"); //$NON-NLS-1$
        fgTags.add("h2"); //$NON-NLS-1$
        fgTags.add("h3"); //$NON-NLS-1$
        fgTags.add("h4"); //$NON-NLS-1$
        fgTags.add("h5"); //$NON-NLS-1$
        fgTags.add("p"); //$NON-NLS-1$
        fgTags.add("dl"); //$NON-NLS-1$
        fgTags.add("dt"); //$NON-NLS-1$
        fgTags.add("dd"); //$NON-NLS-1$
        fgTags.add("li"); //$NON-NLS-1$
        fgTags.add("ul"); //$NON-NLS-1$
        fgTags.add("pre"); //$NON-NLS-1$
        fgTags.add("head"); //$NON-NLS-1$

        fgEntityLookup= new HashMap(7);
        fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
        fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /** Running character count; incremented by <code>read()</code> and used as the offset of bold ranges. */
    private int fCounter= 0;
    /** Receives the bold style ranges; may be <code>null</code>, in which case no ranges are recorded. */
    private TextPresentation fTextPresentation;
    /** Nesting depth of currently open bold sections. */
    private int fBold= 0;
    /** Value of <code>fCounter</code> when the outermost bold section was opened, or -1 if none is open. */
    private int fStartOffset= -1;
    /** Set by an opening <code>p</code> tag and cleared by the closing one. */
    private bool fInParagraph= false;
    /** <code>true</code> between <code>pre</code> and <code>/pre</code>. */
    private bool fIsPreformattedText= false;
    /** <code>true</code> while inside the first <code>head</code> element; plain content is dropped meanwhile. */
    private bool fIgnore= false;
    /** <code>true</code> once a <code>head</code> tag has been seen, so that only the first one is honoured. */
    private bool fHeaderDetected= false;

    /**
     * Transforms the HTML text from the reader to formatted text.
     *
     * @param reader the reader
     * @param presentation If not <code>null</code>, formattings will be applied to
     * the presentation.
     */
    public this(Reader reader, TextPresentation presentation) {
        super(new PushbackReader(reader));
        fTextPresentation= presentation;
    }

    /**
     * Reads the next character from the super class and counts it.
     *
     * @return the character read, or -1 (which is not counted)
     */
    public int read() {
        int c= super.read();
        if (c !is -1)
            ++ fCounter;
        return c;
    }

    /**
     * Opens a (possibly nested) bold section. The start offset is remembered
     * only for the outermost section.
     */
    protected void startBold() {
        if (fBold is 0)
            fStartOffset= fCounter;
        ++ fBold;
    }

    /**
     * Enters preformatted mode and switches whitespace skipping off.
     */
    protected void startPreformattedText() {
        fIsPreformattedText= true;
        setSkipWhitespace(false);
    }

    /**
     * Leaves preformatted mode and switches whitespace skipping back on.
     */
    protected void stopPreformattedText() {
        fIsPreformattedText= false;
        setSkipWhitespace(true);
    }

    /**
     * Closes a bold section. When the outermost section is closed, a bold
     * style range covering it is added to the presentation (if there is one).
     * <p>
     * A closing tag without a matching opening tag is ignored. Without this
     * guard the nesting counter would become negative, and a later nested
     * pair would produce a style range starting at offset -1.</p>
     */
    protected void stopBold() {
        if (fBold <= 0)
            return; // unmatched closing tag

        -- fBold;
        if (fBold is 0) {
            if (fTextPresentation !is null) {
                fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, DWT.BOLD));
            }
            fStartOffset= -1;
        }
    }

    /*
     * @see dwtx.jface.internal.text.html.SubstitutionTextReader#computeSubstitution(int)
     */
    protected String computeSubstitution(int c) {

        // Tags are always processed, even while ignoring, so that "/head" can end the ignored section.
        if (c is '<')
            return processHTMLTag();
        else if (fIgnore)
            return EMPTY_STRING;
        else if (c is '&')
            return processEntity();
        else if (fIsPreformattedText)
            return processPreformattedText(c);

        return null;
    }

    /**
     * Translates the content of a tag (the text between '&lt;' and '&gt;')
     * into its plain text replacement and updates the bold, paragraph,
     * preformatted and header state accordingly.
     *
     * @param html the tag content, e.g. <code>"b"</code> or <code>"/p"</code>
     * @return the replacement text; the empty string for tags that produce no text
     */
    private String html2Text(String html) {

        if (html is null || html.length() is 0)
            return EMPTY_STRING;

        html= html.toLowerCase();

        // Only tags whose name (without a leading '/') is known are handled.
        String tag= html;
        if ('/' is tag.charAt(0))
            tag= tag.substring(1);

        if (!fgTags.contains(tag))
            return EMPTY_STRING;


        if ("pre".equals(html)) { //$NON-NLS-1$
            startPreformattedText();
            return EMPTY_STRING;
        }

        if ("/pre".equals(html)) { //$NON-NLS-1$
            stopPreformattedText();
            return EMPTY_STRING;
        }

        // Inside preformatted text every other tag is cut out without any effect.
        if (fIsPreformattedText)
            return EMPTY_STRING;

        if ("b".equals(html)) { //$NON-NLS-1$
            startBold();
            return EMPTY_STRING;
        }

        // headings ("h" followed by a digit) and definition terms are rendered bold
        if ((html.length() > 1 && html.charAt(0) is 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
            startBold();
            return EMPTY_STRING;
        }

        if ("dl".equals(html)) //$NON-NLS-1$
            return LINE_DELIM;

        if ("dd".equals(html)) //$NON-NLS-1$
            return "\t"; //$NON-NLS-1$

        if ("li".equals(html)) //$NON-NLS-1$
            // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
            return LINE_DELIM ~ HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$

        if ("/b".equals(html)) { //$NON-NLS-1$
            stopBold();
            return EMPTY_STRING;
        }

        if ("p".equals(html)) { //$NON-NLS-1$
            fInParagraph= true;
            return LINE_DELIM;
        }

        if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            return LINE_DELIM;

        if ("/p".equals(html)) { //$NON-NLS-1$
            bool inParagraph= fInParagraph;
            fInParagraph= false;
            return inParagraph ? EMPTY_STRING : LINE_DELIM;
        }

        if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
            stopBold();
            return LINE_DELIM;
        }

        if ("/dd".equals(html)) //$NON-NLS-1$
            return LINE_DELIM;

        // Only the first "head" element switches ignoring on.
        if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
            fHeaderDetected= true;
            fIgnore= true;
            return EMPTY_STRING;
        }

        if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
            fIgnore= false;
            return EMPTY_STRING;
        }

        return EMPTY_STRING;
    }

    /*
     * A '<' has been read. Process a html tag.
     *
     * Returns the replacement text for the tag, "<" followed by the text read
     * so far if another '<' shows up before the tag is closed, or null if the
     * input ends inside the tag.
     */
    private String processHTMLTag() {

        StringBuffer buf= new StringBuffer();
        int ch;
        do {

            ch= nextChar();

            while (ch !is -1 && ch !is '>') {
                buf.append(Character.toLowerCase(cast(wchar) ch));
                ch= nextChar();
                if (ch is '"'){
                    // copy a double-quoted section up to its closing quote, so a '>' inside it does not end the tag
                    buf.append(Character.toLowerCase(cast(wchar) ch));
                    ch= nextChar();
                    while (ch !is -1 && ch !is '"'){
                        buf.append(Character.toLowerCase(cast(wchar) ch));
                        ch= nextChar();
                    }
                }
                if (ch is '<' && !isInComment(buf)) {
                    // not a tag after all: push the new '<' back and emit what was read as text
                    unread(ch);
                    return "<" ~ buf.toString();
                }
            }

            if (ch is -1)
                return null;

            if (!isInComment(buf) || isCommentEnd(buf)) {
                break;
            }
            // unfinished comment: this '>' belongs to the comment text, keep reading
            buf.append(cast(wchar) ch);
        } while (true);

        return html2Text(buf.toString());
    }

    /**
     * Tells whether the tag content collected so far starts a comment,
     * i.e. begins with <code>!--</code>.
     */
    private static bool isInComment(StringBuffer buf) {
        return buf.length() >= 3 && "!--".equals(buf.substring(0, 3)); //$NON-NLS-1$
    }

    /**
     * Tells whether the tag content collected so far ends with <code>--</code>
     * and is at least five characters long.
     */
    private static bool isCommentEnd(StringBuffer buf) {
        int tagLen= buf.length();
        return tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)); //$NON-NLS-1$
    }

    /**
     * Handles a character read inside preformatted text. Line break
     * characters advance the character counter once more; nothing is
     * substituted.
     *
     * @param c the character read
     * @return always <code>null</code> (no substitution)
     */
    private String processPreformattedText(int c) {
        if (c is '\r' || c is '\n')
            fCounter++;
        return null;
    }


    /**
     * Pushes a character back into the underlying reader, which is the
     * <code>PushbackReader</code> created in the constructor.
     */
    private void unread(int ch) {
        (cast(PushbackReader) getReader()).unread(ch);
    }

    /**
     * Translates an entity into its text.
     * <p>
     * Numeric references (<code>#NNN</code> decimal, <code>#xHHH</code> or
     * <code>#XHHH</code> hexadecimal) are decoded to the character with that
     * code point; named entities are looked up in the entity table.</p>
     *
     * @param symbol the entity without the leading '&amp;' and the trailing ';'
     * @return the replacement text, or <code>"&amp;"</code> followed by
     *         <code>symbol</code> if the entity is unknown, malformed or
     *         not a valid code point
     */
    protected String entity2Text(String symbol) {
        if (symbol.length() > 1 && symbol.charAt(0) is '#') {
            try {
                int ch;
                if (symbol.charAt(1) is 'x' || symbol.charAt(1) is 'X') {
                    ch= Integer.parseInt(symbol.substring(2), 16);
                } else {
                    ch= Integer.parseInt(symbol.substring(1), 10);
                }
                if (isValidCodePoint(ch))
                    return codePointToString(ch);
            } catch (NumberFormatException e) {
                // intentionally ignored: a malformed number is reported as "not found" below
            }
        } else {
            String str= cast(String) fgEntityLookup.get(symbol);
            if (str !is null) {
                return str;
            }
        }
        return "&" ~ symbol; // not found //$NON-NLS-1$
    }

    /**
     * Tells whether the value is a Unicode code point that can be encoded,
     * i.e. lies in 0..0x10FFFF and is not a surrogate.
     */
    private static bool isValidCodePoint(int codePoint) {
        if (codePoint < 0 || codePoint > 0x10FFFF)
            return false;
        return codePoint < 0xD800 || codePoint > 0xDFFF;
    }

    /**
     * Encodes a single code point as a UTF-8 string. The caller must pass
     * a valid code point.
     */
    private static String codePointToString(int codePoint) {
        dchar[1] utf32;
        utf32[0]= cast(dchar) codePoint;
        String text;
        // foreach with a char element type transcodes the UTF-32 array to UTF-8 code units
        foreach (char unit; utf32[])
            text ~= unit;
        return text;
    }

    /*
     * A '&' has been read. Process a entity.
     *
     * Returns the translated entity if it is terminated by ';'; otherwise
     * the characters consumed are returned unchanged, prefixed with '&'.
     */
    private String processEntity() {
        StringBuffer buf= new StringBuffer();
        int ch= nextChar();
        while (Character.isLetterOrDigit(cast(wchar)ch) || ch is '#') {
            buf.append(cast(wchar) ch);
            ch= nextChar();
        }

        if (ch is ';')
            return entity2Text(buf.toString());

        buf.insert(0, '&');
        if (ch !is -1)
            buf.append(cast(wchar) ch);
        return buf.toString();
    }
}