Mercurial > projects > dwt-addons
annotate dwtx/jface/internal/text/html/HTML2TextReader.d @ 162:1a5b8f8129df
...
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Mon, 08 Sep 2008 00:51:37 +0200 |
parents | 25f1f92fa3df |
children | c6d7b1ea700b |
rev | line source |
---|---|
129 | 1 /******************************************************************************* |
2 * Copyright (c) 2000, 2008 IBM Corporation and others. | |
3 * All rights reserved. This program and the accompanying materials | |
4 * are made available under the terms of the Eclipse Public License v1.0 | |
5 * which accompanies this distribution, and is available at | |
6 * http://www.eclipse.org/legal/epl-v10.html | |
7 * | |
8 * Contributors: | |
9 * IBM Corporation - initial API and implementation | |
10 * Port to the D programming language: | |
11 * Frank Benoit <benoit@tionex.de> | |
12 *******************************************************************************/ | |
13 module dwtx.jface.internal.text.html.HTML2TextReader; | |
14 | |
131 | 15 import dwtx.jface.internal.text.html.HTMLPrinter; // packageimport |
16 import dwtx.jface.internal.text.html.BrowserInformationControl; // packageimport | |
17 import dwtx.jface.internal.text.html.SubstitutionTextReader; // packageimport | |
18 import dwtx.jface.internal.text.html.HTMLTextPresenter; // packageimport | |
19 import dwtx.jface.internal.text.html.BrowserInput; // packageimport | |
20 import dwtx.jface.internal.text.html.SingleCharReader; // packageimport | |
21 import dwtx.jface.internal.text.html.BrowserInformationControlInput; // packageimport | |
22 import dwtx.jface.internal.text.html.HTMLMessages; // packageimport | |
23 | |
129 | 24 import dwt.dwthelper.utils; |
158 | 25 import dwtx.dwtxhelper.PushbackReader; |
153
f70d9508c95c
Fix java Collection imports
Frank Benoit <benoit@tionex.de>
parents:
150
diff
changeset
|
26 import dwtx.dwtxhelper.Collection; |
162 | 27 static import tango.text.convert.Utf; |
153
f70d9508c95c
Fix java Collection imports
Frank Benoit <benoit@tionex.de>
parents:
150
diff
changeset
|
28 |
129 | 29 import dwt.DWT; |
30 import dwt.custom.StyleRange; | |
31 import dwtx.jface.text.TextPresentation; | |
32 | |
33 | |
34 /** | |
35 * Reads the text contents from a reader of HTML contents and translates | |
36 * the tags or cuts them out. | |
37 * <p> | |
38 * Moved into this package from <code>dwtx.jface.internal.text.revisions</code>.</p> | |
39 */ | |
40 public class HTML2TextReader : SubstitutionTextReader { | |
41 | |
/** Substitution handed back whenever a tag or character produces no output. */
private static const String EMPTY_STRING= ""; //$NON-NLS-1$
/** Names of the tags (without a leading slash) that html2Text reacts to; filled in by the static constructor. */
private static const Set fgTags;
/** Maps entity names such as "lt" to their replacement text; filled in by the static constructor. */
private static const Map fgEntityLookup;
129 | 45 |
/**
 * Fills the two static lookup tables: the named character entities and
 * the set of tag names that html2Text acts upon.
 */
static this() {

    // Named entities and the text each one is replaced with.
    fgEntityLookup= new HashMap(7);
    fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$

    fgTags= new HashSet();

    // bold text and headings
    fgTags.add("b"); //$NON-NLS-1$
    fgTags.add("h1"); //$NON-NLS-1$
    fgTags.add("h2"); //$NON-NLS-1$
    fgTags.add("h3"); //$NON-NLS-1$
    fgTags.add("h4"); //$NON-NLS-1$
    fgTags.add("h5"); //$NON-NLS-1$

    // line and paragraph structure
    fgTags.add("br"); //$NON-NLS-1$
    fgTags.add("br/"); //$NON-NLS-1$
    fgTags.add("div"); //$NON-NLS-1$
    fgTags.add("p"); //$NON-NLS-1$
    fgTags.add("pre"); //$NON-NLS-1$

    // lists
    fgTags.add("dl"); //$NON-NLS-1$
    fgTags.add("dt"); //$NON-NLS-1$
    fgTags.add("dd"); //$NON-NLS-1$
    fgTags.add("ul"); //$NON-NLS-1$
    fgTags.add("li"); //$NON-NLS-1$

    // document header
    fgTags.add("head"); //$NON-NLS-1$
}
76 | |
/** Number of characters handed out by read() so far; line breaks seen in preformatted text are counted once more. */
private int fCounter= 0;
/** Receives a bold style range for every bold region that gets closed; may be null. */
private TextPresentation fTextPresentation;
/** Nesting depth of the bold regions that are currently open. */
private int fBold= 0;
/** Value of fCounter at the moment the outermost bold region was opened, or -1 when none is open. */
private int fStartOffset= -1;
/** Set by an opening "p" tag and cleared again by the closing one. */
private bool fInParagraph= false;
/** True between the "pre" and "/pre" tags. */
private bool fIsPreformattedText= false;
/** While true, computeSubstitution drops everything that is not a tag. */
private bool fIgnore= false;
/** True once the first "head" tag has been seen. */
private bool fHeaderDetected= false;
85 | |
86 /** | |
87 * Transforms the HTML text from the reader to formatted text. | |
88 * | |
89 * @param reader the reader | |
90 * @param presentation If not <code>null</code>, formattings will be applied to | |
91 * the presentation. | |
92 */ | |
133
7d818bd32d63
Fix ctors to this with gvim regexp
Frank Benoit <benoit@tionex.de>
parents:
131
diff
changeset
|
public this(Reader reader, TextPresentation presentation) {
    // The source is wrapped in a PushbackReader because unreadDChar()
    // casts getReader() to that type in order to push characters back.
    super(new PushbackReader(reader));
    this.fTextPresentation= presentation;
}
97 | |
136
6dcb0baaa031
Regex removal of throws decls, some instanceof
Frank Benoit <benoit@tionex.de>
parents:
134
diff
changeset
|
/**
 * Reads the next character of the translated text and keeps fCounter in
 * step with the number of characters delivered.
 *
 * @return the character read, or -1 when the superclass reports -1
 */
public int read() {
    int next= super.read();
    if (next is -1)
        return -1;

    fCounter++;
    return next;
}
104 | |
/**
 * Opens a bold region. Only the outermost region remembers the current
 * output offset; nested regions merely raise the nesting depth.
 */
protected void startBold() {
    bool outermost= (fBold is 0);
    fBold++;
    if (outermost)
        fStartOffset= fCounter;
}
110 | |
/**
 * Enters preformatted mode: raises the flag and switches whitespace
 * skipping off.
 */
protected void startPreformattedText() {
    this.fIsPreformattedText= true;
    this.setSkipWhitespace(false);
}
115 | |
/**
 * Leaves preformatted mode: clears the flag and switches whitespace
 * skipping back on.
 */
protected void stopPreformattedText() {
    this.fIsPreformattedText= false;
    this.setSkipWhitespace(true);
}
120 | |
/**
 * Closes the innermost open bold region. When the outermost region is
 * closed, a bold style range covering the text written since the matching
 * startBold() is added to the presentation (if there is one) and the
 * remembered start offset is reset to -1.
 * <p>
 * A call without a matching startBold() is ignored.</p>
 */
protected void stopBold() {
    // An unmatched closing tag (e.g. a stray "</b>") must not push the
    // nesting depth below zero. Otherwise each following non-nested
    // open/close pair would only move the depth between -1 and 0, so
    // startBold() would never record an offset and no range would be
    // added here.
    if (fBold <= 0)
        return;

    -- fBold;
    if (fBold is 0) {
        if (fTextPresentation !is null) {
            fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, DWT.BOLD));
        }
        fStartOffset= -1;
    }
}
130 | |
131 /* | |
132 * @see dwtx.jface.internal.text.html.SubstitutionTextReader#computeSubstitution(int) | |
133 */ | |
136
6dcb0baaa031
Regex removal of throws decls, some instanceof
Frank Benoit <benoit@tionex.de>
parents:
134
diff
changeset
|
protected String computeSubstitution(int c) {

    // Tags are processed first, even while ignoring content, so that the
    // closing "head" tag can switch ignoring off again.
    if (c is '<')
        return processHTMLTag();

    // Everything else is dropped while ignoring.
    if (fIgnore)
        return EMPTY_STRING;

    if (c is '&')
        return processEntity();

    if (fIsPreformattedText)
        return processPreformattedText(c);

    // null: no substitution for this character
    return null;
}
147 | |
/**
 * Translates the text found between '<' and '>' into its plain-text
 * substitution and updates the bold, paragraph, preformatted and header
 * state accordingly.
 *
 * @param html the tag text without the angle brackets; may be null
 * @return the replacement text; EMPTY_STRING for tags that produce no output
 */
private String html2Text(String html) {

    if (html is null || html.length() is 0)
        return EMPTY_STRING;

    html= html.toLowerCase();

    // Only tags whose name (leading slash removed) is known are handled.
    String name= ('/' is html.charAt(0)) ? html.substring(1) : html;
    if (!fgTags.contains(name))
        return EMPTY_STRING;

    // The pre tags are honoured even while inside preformatted text ...
    if ("pre".equals(html)) { //$NON-NLS-1$
        startPreformattedText();
        return EMPTY_STRING;
    }
    if ("/pre".equals(html)) { //$NON-NLS-1$
        stopPreformattedText();
        return EMPTY_STRING;
    }

    // ... every other tag is dropped there.
    if (fIsPreformattedText)
        return EMPTY_STRING;

    // --- tags that open a bold region -----------------------------------
    bool opensHeading= html.length() > 1 && html.charAt(0) is 'h' && Character.isDigit(html.charAt(1));
    if ("b".equals(html) || opensHeading || "dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
        startBold();
        return EMPTY_STRING;
    }

    // --- tags that close a bold region ----------------------------------
    if ("/b".equals(html)) { //$NON-NLS-1$
        stopBold();
        return EMPTY_STRING;
    }
    bool closesHeading= html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2)); //$NON-NLS-1$
    if (closesHeading || "/dt".equals(html)) { //$NON-NLS-1$
        stopBold();
        return LINE_DELIM;
    }

    // --- list items -----------------------------------------------------
    if ("li".equals(html)) { //$NON-NLS-1$
        // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
        return LINE_DELIM ~ HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$
    }
    if ("dd".equals(html)) //$NON-NLS-1$
        return "\t"; //$NON-NLS-1$

    // --- paragraphs -----------------------------------------------------
    if ("p".equals(html)) { //$NON-NLS-1$
        fInParagraph= true;
        return LINE_DELIM;
    }
    if ("/p".equals(html)) { //$NON-NLS-1$
        bool wasInParagraph= fInParagraph;
        fInParagraph= false;
        if (wasInParagraph)
            return EMPTY_STRING;
        return LINE_DELIM;
    }

    // --- tags that only produce a line break ----------------------------
    if ("dl".equals(html) || "br".equals(html) || "br/".equals(html) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            || "div".equals(html) || "/dd".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$
        return LINE_DELIM;

    // --- document header: only the first one switches ignoring on -------
    if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
        fHeaderDetected= true;
        fIgnore= true;
    } else if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
        fIgnore= false;
    }

    return EMPTY_STRING;
}
236 | |
237 /* | |
238 * A '<' has been read. Process an HTML tag. | |
239 */ | |
136
6dcb0baaa031
Regex removal of throws decls, some instanceof
Frank Benoit <benoit@tionex.de>
parents:
134
diff
changeset
|
/**
 * Collects the lower-cased tag text up to the closing '>' and hands it to
 * html2Text. Double-quoted attribute values are copied as a whole, and a
 * comment ("!--") is only finished by a '>' that directly follows "--".
 *
 * @return the substitution for the tag; the raw text prefixed with '<'
 *         when another '<' shows up outside of a comment; null when the
 *         input ends before the tag is closed
 */
private String processHTMLTag() {

    StringBuffer tagText= new StringBuffer();

    for (;;) {
        int cur= nextDChar();

        while (cur !is -1 && cur !is '>') {
            tagText.append(dcharToString(Character.toLowerCase(cast(dchar) cur)));
            cur= nextDChar();

            if (cur is '"') {
                // Copy the opening quote and everything up to (but not
                // including) the closing quote; the closing quote is
                // appended by the next round of the enclosing loop.
                do {
                    tagText.append(dcharToString(Character.toLowerCase(cast(dchar) cur)));
                    cur= nextDChar();
                } while (cur !is -1 && cur !is '"');
            }

            if (cur is '<' && !isInComment(tagText)) {
                // Not a tag after all: give the new '<' back and emit
                // what was read so far as literal text.
                unreadDChar(cur);
                return '<' ~ tagText.toString();
            }
        }

        if (cur is -1)
            return null;

        bool unfinishedComment= isInComment(tagText) && !isCommentEnd(tagText);
        if (!unfinishedComment)
            return html2Text(tagText.toString());

        // This '>' belongs to the comment body: keep it and read on.
        tagText.append(dcharToString(cast(dchar) cur));
    }
}
150 | 277 |
/**
 * Tells whether the collected tag text starts with the comment opener
 * "!--".
 */
private static bool isInComment(StringBuffer buf) {
    if (buf.length() < 3)
        return false;
    return "!--".equals(buf.slice().substring(0, 3)); //$NON-NLS-1$
}
150 | 281 |
/**
 * Tells whether the collected tag text is at least five characters long
 * and ends with "--".
 */
private static bool isCommentEnd(StringBuffer buf) {
    int len= buf.length();
    if (len < 5)
        return false;
    return "--".equals(buf.slice().substring(len - 2)); //$NON-NLS-1$
}
286 | |
/**
 * Handles a character read inside preformatted text: a carriage return or
 * line feed bumps fCounter. The character itself is never substituted.
 *
 * @param c the character that has been read
 * @return always null
 */
private String processPreformattedText(int c) {
    bool lineBreak= (c is '\n' || c is '\r');
    if (lineBreak)
        ++ fCounter;
    return null;
}
292 | |
293 | |
/**
 * Pushes a character back onto the underlying PushbackReader so that it
 * is delivered again by the next read.
 *
 * @param ch the character to push back
 */
private void unreadDChar(dchar ch) {
    char[4] utf8;
    dchar[1] single;
    single[0] = ch;
    PushbackReader pushback = cast(PushbackReader) getReader();
    // unread() makes its argument the next char to be read, so the UTF-8
    // code units have to be pushed back last-first; pushing them in forward
    // order would hand a multi-byte character back reversed.
    foreach_reverse( char c; tango.text.convert.Utf.toString( single[], utf8[] )){
        pushback.unread(c);
    }
}
302 | |
/**
 * Translates the text found between '&' and ';' into the character(s) it
 * stands for.
 * <p>
 * Numeric references are accepted in decimal ("#65") and hexadecimal
 * ("#x41" or "#X41") form; named entities are looked up in
 * fgEntityLookup.</p>
 *
 * @param symbol the entity text without the leading '&' and trailing ';'
 * @return the replacement text, or "&" followed by symbol when the entity
 *         is unknown, malformed, or names a value that is not a valid
 *         Unicode character
 */
protected String entity2Text(String symbol) {
    if (symbol.length() > 1 && symbol.charAt(0) is '#') {
        try {
            int ch;
            // the hex marker may be written in either case
            if (symbol.charAt(1) is 'x' || symbol.charAt(1) is 'X') {
                ch= Integer.parseInt(symbol.substring(2), 16);
            } else {
                ch= Integer.parseInt(symbol.substring(1), 10);
            }
            // Only real Unicode scalar values can be encoded: reject
            // negatives, values above U+10FFFF and the surrogate range.
            bool encodable= ch >= 0 && ch <= 0x10FFFF && (ch < 0xD800 || ch > 0xDFFF);
            if (encodable)
                return dcharToString( cast(dchar)ch);
        } catch (NumberFormatException e) {
            // Not a number: fall through and emit the text unchanged.
        }
    } else {
        String str= stringcast( fgEntityLookup.get(symbol));
        if (str !is null) {
            return str;
        }
    }
    return "&" ~ symbol; // not found //$NON-NLS-1$
}
323 | |
324 /* | |
325 * A '&' has been read. Process an entity. | |
326 */ | |
136
6dcb0baaa031
Regex removal of throws decls, some instanceof
Frank Benoit <benoit@tionex.de>
parents:
134
diff
changeset
|
/**
 * Reads the entity name that follows an already consumed '&' and
 * translates it.
 * <p>
 * The name consists of letters, digits and '#'. If it is terminated by
 * ';' the entity is translated by entity2Text. Otherwise the text is
 * emitted literally, starting with '&'.</p>
 *
 * @return the replacement text for the entity, or the literal text read
 */
private String processEntity() {
    StringBuffer buf= new StringBuffer();
    int ch= nextDChar();
    // Stop explicitly at end of input instead of relying on how the
    // character classifier treats the value -1 cast to dchar.
    while (ch !is -1 && (Character.isLetterOrDigit(cast(dchar)ch) || ch is '#')) {
        buf.append(dcharToString(cast(dchar) ch));
        ch= nextDChar();
    }

    if (ch is ';')
        return entity2Text(buf.toString());

    String text= "&" ~ buf.toString(); //$NON-NLS-1$
    if (ch is '<' || ch is '&') {
        // The terminator starts a tag or another entity: push it back so
        // it is processed normally instead of being swallowed as literal
        // text (same approach as processHTMLTag takes for a stray '<').
        unreadDChar(cast(dchar) ch);
    } else if (ch !is -1) {
        text ~= dcharToString(cast(dchar) ch);
    }
    return text;
}
344 } |