Mercurial > projects > dwt-addons
annotate dwtx/jface/internal/text/html/HTML2TextReader.d @ 133:7d818bd32d63
Fix ctors to this with gvim regexp
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sun, 24 Aug 2008 01:29:22 +0200 |
parents | c4fb132a086c |
children | 51e6e63f930e |
rev | line source |
---|---|
129 | 1 /******************************************************************************* |
2 * Copyright (c) 2000, 2008 IBM Corporation and others. | |
3 * All rights reserved. This program and the accompanying materials | |
4 * are made available under the terms of the Eclipse Public License v1.0 | |
5 * which accompanies this distribution, and is available at | |
6 * http://www.eclipse.org/legal/epl-v10.html | |
7 * | |
8 * Contributors: | |
9 * IBM Corporation - initial API and implementation | |
10 * Port to the D programming language: | |
11 * Frank Benoit <benoit@tionex.de> | |
12 *******************************************************************************/ | |
13 module dwtx.jface.internal.text.html.HTML2TextReader; | |
14 | |
131 | 15 import dwtx.jface.internal.text.html.HTMLPrinter; // packageimport |
16 import dwtx.jface.internal.text.html.BrowserInformationControl; // packageimport | |
17 import dwtx.jface.internal.text.html.SubstitutionTextReader; // packageimport | |
18 import dwtx.jface.internal.text.html.HTMLTextPresenter; // packageimport | |
19 import dwtx.jface.internal.text.html.BrowserInput; // packageimport | |
20 import dwtx.jface.internal.text.html.SingleCharReader; // packageimport | |
21 import dwtx.jface.internal.text.html.BrowserInformationControlInput; // packageimport | |
22 import dwtx.jface.internal.text.html.HTMLMessages; // packageimport | |
23 | |
24 | |
129 | 25 import dwt.dwthelper.utils; |
26 | |
27 import java.io.IOException; | |
28 import java.io.PushbackReader; | |
29 import java.io.Reader; | |
30 import java.util.HashMap; | |
31 import java.util.HashSet; | |
32 import java.util.Map; | |
33 import java.util.Set; | |
34 | |
35 import dwt.DWT; | |
36 import dwt.custom.StyleRange; | |
37 import dwtx.jface.text.TextPresentation; | |
38 | |
39 | |
/**
 * Reads the text contents from a reader of HTML contents and translates
 * the tags or cuts them out.
 * <p>
 * Only the tags registered in <code>fgTags</code> influence the output; every
 * other tag is replaced by the empty string. Bold ranges (<code>b</code>,
 * <code>h1</code>..<code>h5</code>, <code>dt</code>) are reported to the optional
 * {@link TextPresentation} passed to the constructor.</p>
 * <p>
 * Moved into this package from <code>dwtx.jface.internal.text.revisions</code>.</p>
 */
public class HTML2TextReader : SubstitutionTextReader {

    private static final String EMPTY_STRING= ""; //$NON-NLS-1$
    /** Maps an entity name (without '&amp;' and ';') to its replacement text. */
    private static final Map fgEntityLookup;
    /** Lower-case names of the tags that {@link #html2Text(String)} acts on. */
    private static final Set fgTags;

    static {

        fgTags= new HashSet();
        fgTags.add("b"); //$NON-NLS-1$
        fgTags.add("br"); //$NON-NLS-1$
        fgTags.add("br/"); //$NON-NLS-1$
        fgTags.add("div"); //$NON-NLS-1$
        fgTags.add("h1"); //$NON-NLS-1$
        fgTags.add("h2"); //$NON-NLS-1$
        fgTags.add("h3"); //$NON-NLS-1$
        fgTags.add("h4"); //$NON-NLS-1$
        fgTags.add("h5"); //$NON-NLS-1$
        fgTags.add("p"); //$NON-NLS-1$
        fgTags.add("dl"); //$NON-NLS-1$
        fgTags.add("dt"); //$NON-NLS-1$
        fgTags.add("dd"); //$NON-NLS-1$
        fgTags.add("li"); //$NON-NLS-1$
        fgTags.add("ul"); //$NON-NLS-1$
        fgTags.add("pre"); //$NON-NLS-1$
        fgTags.add("head"); //$NON-NLS-1$

        fgEntityLookup= new HashMap(7);
        fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
        fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
        fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /** Number of characters handed out by {@link #read()} so far. */
    private int fCounter= 0;
    /** Receives the bold style ranges; may be <code>null</code>. */
    private TextPresentation fTextPresentation;
    /** Nesting depth of currently open bold regions. */
    private int fBold= 0;
    /** Value of <code>fCounter</code> when the outermost bold region started, or -1. */
    private int fStartOffset= -1;
    /** Set by an opening <code>p</code> tag, cleared by the closing one. */
    private bool fInParagraph= false;
    /** <code>true</code> between <code>pre</code> and <code>/pre</code>. */
    private bool fIsPreformattedText= false;
    /** <code>true</code> while plain characters are dropped (inside the first <code>head</code>). */
    private bool fIgnore= false;
    /** <code>true</code> once the first <code>head</code> tag has been seen. */
    private bool fHeaderDetected= false;

    /**
     * Transforms the HTML text from the reader to formatted text.
     *
     * @param reader the reader
     * @param presentation If not <code>null</code>, formattings will be applied to
     * the presentation.
     */
    public this(Reader reader, TextPresentation presentation) {
        // The reader is wrapped so that unread(int) can push a character back.
        super(new PushbackReader(reader));
        fTextPresentation= presentation;
    }

    /**
     * Reads the next character from the super class and counts it.
     *
     * @return the character read, or -1 (which is not counted)
     * @throws IOException if the super class read fails
     */
    public int read() throws IOException {
        int c= super.read();
        if (c !is -1)
            ++ fCounter;
        return c;
    }

    /**
     * Opens a bold region. Only the outermost region records the start offset;
     * nested regions just increase the nesting depth.
     */
    protected void startBold() {
        if (fBold is 0)
            fStartOffset= fCounter;
        ++ fBold;
    }

    /**
     * Enters preformatted mode and turns whitespace skipping off.
     */
    protected void startPreformattedText() {
        fIsPreformattedText= true;
        setSkipWhitespace(false);
    }

    /**
     * Leaves preformatted mode and turns whitespace skipping back on.
     */
    protected void stopPreformattedText() {
        fIsPreformattedText= false;
        setSkipWhitespace(true);
    }

    /**
     * Closes a bold region. When the outermost region is closed, a bold
     * {@link StyleRange} covering the characters read since the matching
     * {@link #startBold()} is added to the presentation, if there is one.
     * <p>
     * A call without a matching {@link #startBold()} is ignored.</p>
     */
    protected void stopBold() {
        // An unmatched closing tag must not drive the nesting depth below zero:
        // startBold() only records the start offset when the depth is exactly 0,
        // so a negative depth would make the next bold region lose its range.
        if (fBold <= 0)
            return;

        -- fBold;
        if (fBold is 0) {
            if (fTextPresentation !is null) {
                fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, DWT.BOLD));
            }
            fStartOffset= -1;
        }
    }

    /*
     * @see dwtx.jdt.internal.ui.text.SubstitutionTextReader#computeSubstitution(int)
     *
     * Tags are processed even while fIgnore is set, so that the closing
     * "head" tag can switch ignoring off again.
     */
    protected String computeSubstitution(int c) throws IOException {

        if (c is '<')
            return processHTMLTag();
        else if (fIgnore)
            return EMPTY_STRING;
        else if (c is '&')
            return processEntity();
        else if (fIsPreformattedText)
            return processPreformattedText(c);

        return null;
    }

    /**
     * Translates the content of a tag (the text between '&lt;' and '&gt;') into
     * its plain text replacement and updates the reader state accordingly.
     * The whole content must equal a registered tag name, optionally preceded
     * by '/'; anything else yields the empty string.
     *
     * @param html the tag content, may be <code>null</code>
     * @return the replacement text, never <code>null</code>
     */
    private String html2Text(String html) {

        if (html is null || html.length() is 0)
            return EMPTY_STRING;

        html= html.toLowerCase();

        // Strip the leading '/' of a closing tag for the lookup only.
        String tag= html;
        if ('/' is tag.charAt(0))
            tag= tag.substring(1);

        if (!fgTags.contains(tag))
            return EMPTY_STRING;


        if ("pre".equals(html)) { //$NON-NLS-1$
            startPreformattedText();
            return EMPTY_STRING;
        }

        if ("/pre".equals(html)) { //$NON-NLS-1$
            stopPreformattedText();
            return EMPTY_STRING;
        }

        // Inside preformatted text all other known tags are dropped.
        if (fIsPreformattedText)
            return EMPTY_STRING;

        if ("b".equals(html)) { //$NON-NLS-1$
            startBold();
            return EMPTY_STRING;
        }

        // Headings and definition terms are rendered bold.
        if ((html.length() > 1 && html.charAt(0) is 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
            startBold();
            return EMPTY_STRING;
        }

        if ("dl".equals(html)) //$NON-NLS-1$
            return LINE_DELIM;

        if ("dd".equals(html)) //$NON-NLS-1$
            return "\t"; //$NON-NLS-1$

        if ("li".equals(html)) //$NON-NLS-1$
            // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
            return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$

        if ("/b".equals(html)) { //$NON-NLS-1$
            stopBold();
            return EMPTY_STRING;
        }

        if ("p".equals(html)) { //$NON-NLS-1$
            fInParagraph= true;
            return LINE_DELIM;
        }

        if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            return LINE_DELIM;

        if ("/p".equals(html)) { //$NON-NLS-1$
            // A closing tag that matches an opening one adds nothing; a stray
            // closing tag yields a line delimiter.
            bool inParagraph= fInParagraph;
            fInParagraph= false;
            return inParagraph ? EMPTY_STRING : LINE_DELIM;
        }

        if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
            stopBold();
            return LINE_DELIM;
        }

        if ("/dd".equals(html)) //$NON-NLS-1$
            return LINE_DELIM;

        // Only the first "head" element switches ignoring on.
        if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
            fHeaderDetected= true;
            fIgnore= true;
            return EMPTY_STRING;
        }

        if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
            fIgnore= false;
            return EMPTY_STRING;
        }

        return EMPTY_STRING;
    }

    /*
     * A '<' has been read. Process a html tag.
     *
     * Collects the lower-cased characters up to the closing '>' and hands
     * them to html2Text(String). Special cases:
     * - a double-quoted section is copied up to its closing quote, so a '>'
     *   or '<' inside it does not end the tag;
     * - a further '<' outside of a comment means the first '<' was plain text:
     *   the second '<' is pushed back and '<' plus the collected text is returned;
     * - inside a comment ("!--") a '>' only ends the tag when preceded by "--";
     * - null is returned when the end of the input is reached first.
     */
    private String processHTMLTag() throws IOException {

        StringBuffer buf= new StringBuffer();
        int ch;
        do {

            ch= nextChar();

            while (ch !is -1 && ch !is '>') {
                buf.append(Character.toLowerCase((char) ch));
                ch= nextChar();
                if (ch is '"'){
                    buf.append(Character.toLowerCase((char) ch));
                    ch= nextChar();
                    while (ch !is -1 && ch !is '"'){
                        buf.append(Character.toLowerCase((char) ch));
                        ch= nextChar();
                    }
                }
                if (ch is '<' && !isInComment(buf)) {
                    unread(ch);
                    return '<' + buf.toString();
                }
            }

            if (ch is -1)
                return null;

            if (!isInComment(buf) || isCommentEnd(buf)) {
                break;
            }
            // unfinished comment
            buf.append((char) ch);
        } while (true);

        return html2Text(buf.toString());
    }

    /**
     * Tells whether the collected tag content starts with the comment marker "!--".
     *
     * @param buf the tag content collected so far
     * @return <code>true</code> if the content starts a comment
     */
    private static bool isInComment(StringBuffer buf) {
        return buf.length() >= 3 && "!--".equals(buf.substring(0, 3)); //$NON-NLS-1$
    }

    /**
     * Tells whether the collected tag content ends with "--" and is long enough
     * to hold both the opening "!--" and the closing "--".
     *
     * @param buf the tag content collected so far
     * @return <code>true</code> if the content ends a comment
     */
    private static bool isCommentEnd(StringBuffer buf) {
        int tagLen= buf.length();
        return tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)); //$NON-NLS-1$
    }

    /**
     * Handles a plain character inside preformatted text: the character counter
     * is incremented once more for '\r' and '\n'.
     * NOTE(review): presumably this compensates for how the base reader emits
     * line breaks - confirm against SubstitutionTextReader.
     *
     * @param c the character read
     * @return always <code>null</code>, i.e. no substitution
     */
    private String processPreformattedText(int c) {
        if (c is '\r' || c is '\n')
            fCounter++;
        return null;
    }


    /**
     * Pushes a character back to the underlying reader, which is the
     * {@link PushbackReader} created in the constructor.
     *
     * @param ch the character to push back
     * @throws IOException if pushing back fails
     */
    private void unread(int ch) throws IOException {
        ((PushbackReader) getReader()).unread(ch);
    }

    /**
     * Translates an entity name into its text.
     * <p>
     * Numeric references are supported in decimal (<code>#65</code>) and
     * hexadecimal (<code>#x41</code> or <code>#X41</code>) form. Named entities
     * are looked up in <code>fgEntityLookup</code>.</p>
     *
     * @param symbol the entity without the leading '&amp;' and trailing ';'
     * @return the replacement text, or '&amp;' followed by <code>symbol</code>
     *         if the entity is unknown, malformed or out of range
     */
    protected String entity2Text(String symbol) {
        if (symbol.length() > 1 && symbol.charAt(0) is '#') {
            int ch;
            try {
                char radixMarker= symbol.charAt(1);
                // The hexadecimal marker is accepted in either case.
                if (radixMarker is 'x' || radixMarker is 'X') {
                    ch= Integer.parseInt(symbol.substring(2), 16);
                } else {
                    ch= Integer.parseInt(symbol.substring(1), 10);
                }
                // Only convert values that survive the cast below unchanged;
                // anything else would be truncated to an unrelated character,
                // so it is emitted as raw text instead.
                if (ch >= 0 && ch <= 0xFFFF)
                    return EMPTY_STRING + (char)ch;
            } catch (NumberFormatException e) {
                // Deliberately ignored: a malformed number falls through and is
                // returned as raw text.
            }
        } else {
            String str= (String) fgEntityLookup.get(symbol);
            if (str !is null) {
                return str;
            }
        }
        return "&" + symbol; // not found //$NON-NLS-1$
    }

    /*
     * A '&' has been read. Process a entity.
     *
     * Collects letters, digits and '#'. If the next character is ';' the
     * collected name is translated by entity2Text(String). Otherwise the text is
     * returned as read, i.e. '&' plus the collected characters plus the
     * character that ended the collection (unless that was the end of input).
     * Note that this terminating character is consumed here and therefore is
     * not run through computeSubstitution(int) itself.
     */
    private String processEntity() throws IOException {
        StringBuffer buf= new StringBuffer();
        int ch= nextChar();
        while (Character.isLetterOrDigit((char)ch) || ch is '#') {
            buf.append((char) ch);
            ch= nextChar();
        }

        if (ch is ';')
            return entity2Text(buf.toString());

        buf.insert(0, '&');
        if (ch !is -1)
            buf.append((char) ch);
        return buf.toString();
    }
}