Mercurial > projects > dwt-addons
comparison dwtx/jface/internal/text/html/HTML2TextReader.d @ 129:eb30df5ca28b
Added JFace Text sources
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sat, 23 Aug 2008 19:10:48 +0200 |
parents | |
children | c4fb132a086c |
comparison
equal
deleted
inserted
replaced
128:8df1d4193877 | 129:eb30df5ca28b |
---|---|
1 /******************************************************************************* | |
2 * Copyright (c) 2000, 2008 IBM Corporation and others. | |
3 * All rights reserved. This program and the accompanying materials | |
4 * are made available under the terms of the Eclipse Public License v1.0 | |
5 * which accompanies this distribution, and is available at | |
6 * http://www.eclipse.org/legal/epl-v10.html | |
7 * | |
8 * Contributors: | |
9 * IBM Corporation - initial API and implementation | |
10 * Port to the D programming language: | |
11 * Frank Benoit <benoit@tionex.de> | |
12 *******************************************************************************/ | |
13 module dwtx.jface.internal.text.html.HTML2TextReader; | |
14 | |
15 import dwt.dwthelper.utils; | |
16 | |
17 import java.io.IOException; | |
18 import java.io.PushbackReader; | |
19 import java.io.Reader; | |
20 import java.util.HashMap; | |
21 import java.util.HashSet; | |
22 import java.util.Map; | |
23 import java.util.Set; | |
24 | |
25 import dwt.DWT; | |
26 import dwt.custom.StyleRange; | |
27 import dwtx.jface.text.TextPresentation; | |
28 | |
29 | |
30 /** | |
31 * Reads the text contents from a reader of HTML contents and either translates | |
32 * the tags or cuts them out. | |
33 * <p> | |
34 * Moved into this package from <code>dwtx.jface.internal.text.revisions</code>.</p> | |
35 */ | |
36 public class HTML2TextReader : SubstitutionTextReader { | |
37 | |
/** Substitution used whenever a construct produces no output. */
private static final String EMPTY_STRING= ""; //$NON-NLS-1$
/** Maps an entity name (without the leading '&' and trailing ';') to its replacement text. */
private static final Map fgEntityLookup;
/** Lower-case tag names (without a leading '/') that html2Text does not discard outright. */
private static final Set fgTags;

static {

    // Named character entities understood by entity2Text.
    fgEntityLookup= new HashMap(7);
    fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
    fgEntityLookup.put("tilde", "~"); //$NON-NLS-1$ //$NON-NLS-2$

    fgTags= new HashSet();

    // Bold text and headings.
    fgTags.add("b"); //$NON-NLS-1$
    fgTags.add("h1"); //$NON-NLS-1$
    fgTags.add("h2"); //$NON-NLS-1$
    fgTags.add("h3"); //$NON-NLS-1$
    fgTags.add("h4"); //$NON-NLS-1$
    fgTags.add("h5"); //$NON-NLS-1$

    // Line and paragraph structure.
    fgTags.add("br"); //$NON-NLS-1$
    fgTags.add("br/"); //$NON-NLS-1$
    fgTags.add("div"); //$NON-NLS-1$
    fgTags.add("p"); //$NON-NLS-1$
    fgTags.add("pre"); //$NON-NLS-1$

    // Lists and definition lists.
    fgTags.add("ul"); //$NON-NLS-1$
    fgTags.add("li"); //$NON-NLS-1$
    fgTags.add("dl"); //$NON-NLS-1$
    fgTags.add("dt"); //$NON-NLS-1$
    fgTags.add("dd"); //$NON-NLS-1$

    // Document header, whose character content is suppressed.
    fgTags.add("head"); //$NON-NLS-1$
}
72 | |
/** Optional sink for bold style ranges; assigned once in the constructor and may be null. */
private TextPresentation fTextPresentation;

/**
 * Running character count: incremented for every character returned by read(), and once
 * more for each CR or LF seen while in preformatted text.
 */
private int fCounter= 0;

/** Number of currently open bold regions (b, heading and dt tags). */
private int fBold= 0;
/** Value of fCounter when the outermost bold region was opened, or -1 when none is open. */
private int fStartOffset= -1;

/** Set by an opening p tag and cleared by the closing one. */
private bool fInParagraph= false;
/** True between an opening and a closing pre tag. */
private bool fIsPreformattedText= false;

/** While true, every character that does not start a tag is replaced by the empty string. */
private bool fIgnore= false;
/** Becomes true when the first head tag is seen, so that later head tags are not acted on. */
private bool fHeaderDetected= false;
81 | |
/**
 * Creates a reader that turns the HTML supplied by <code>reader</code> into formatted text.
 * The given reader is wrapped in a <code>PushbackReader</code> before it is handed to the
 * superclass, so that a character can be pushed back while scanning tags.
 *
 * @param reader the reader supplying the HTML input
 * @param presentation receives a bold style range for every completed bold region;
 *        may be <code>null</code>, in which case no style ranges are recorded
 */
public HTML2TextReader(Reader reader, TextPresentation presentation) {
    super(new PushbackReader(reader));
    this.fTextPresentation= presentation;
}
93 | |
/**
 * Reads one character through the superclass and keeps the running character
 * count up to date.
 *
 * @return the character delivered by the superclass, or -1 at end of input
 * @throws IOException if the superclass read fails
 */
public int read() throws IOException {
    final int next= super.read();
    if (next is -1)
        return -1; // end of input is not counted
    fCounter++;
    return next;
}
100 | |
/**
 * Opens one level of bold formatting. Only the outermost level records the
 * current character count as the start of the bold region.
 */
protected void startBold() {
    bool outermost= (fBold is 0);
    if (outermost)
        fStartOffset= fCounter;
    fBold++;
}
106 | |
/**
 * Enters preformatted mode: marks the reader as being inside preformatted
 * text and switches whitespace skipping off.
 */
protected void startPreformattedText() {
    this.fIsPreformattedText= true;
    setSkipWhitespace(false);
}
111 | |
/**
 * Leaves preformatted mode: clears the preformatted flag and switches
 * whitespace skipping back on.
 */
protected void stopPreformattedText() {
    this.fIsPreformattedText= false;
    setSkipWhitespace(true);
}
116 | |
/**
 * Closes one level of bold formatting. When the outermost level is closed, a
 * bold style range covering the characters counted since the matching
 * {@link #startBold()} is added to the text presentation (if one was supplied)
 * and the recorded start offset is reset.
 * <p>
 * A call without a matching {@link #startBold()} (for example an unbalanced
 * closing tag in malformed HTML) is ignored. Letting the nesting counter go
 * negative would prevent {@link #startBold()} from ever seeing the counter at
 * zero again, so no further bold range would be recorded for the rest of the input.
 * </p>
 */
protected void stopBold() {
    if (fBold <= 0) {
        // Nothing is open: ignore the stray close instead of corrupting the nesting depth.
        return;
    }

    -- fBold;
    if (fBold is 0) {
        if (fTextPresentation !is null) {
            fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, DWT.BOLD));
        }
        fStartOffset= -1;
    }
}
126 | |
/*
 * Chooses the replacement text for the character that was just read. The
 * checks are ordered: a '<' always starts tag processing, even while other
 * content is being ignored; ignored content is replaced by the empty string;
 * '&' starts entity processing; anything else inside preformatted text goes
 * through processPreformattedText. Returns null in all remaining cases
 * (presumably meaning "no substitution" - confirm against SubstitutionTextReader).
 *
 * @see SubstitutionTextReader#computeSubstitution(int)
 */
protected String computeSubstitution(int c) throws IOException {

    if (c is '<')
        return processHTMLTag();

    if (fIgnore)
        return EMPTY_STRING;

    if (c is '&')
        return processEntity();

    if (fIsPreformattedText)
        return processPreformattedText(c);

    return null;
}
143 | |
/*
 * Translates the content of a tag (the text between '<' and '>') into the
 * text that replaces it, updating the bold, paragraph, preformatted and
 * header state as a side effect.
 *
 * Tags that are not in fgTags, which includes every tag carrying attributes,
 * yield the empty string. Inside preformatted text every tag except the
 * closing pre tag yields the empty string as well.
 */
private String html2Text(String html) {

    if (html is null || html.length() is 0)
        return EMPTY_STRING;

    html= html.toLowerCase();

    // The membership test ignores a leading '/', so closing tags are looked up by name.
    String tagName= ('/' is html.charAt(0)) ? html.substring(1) : html;
    if (!fgTags.contains(tagName))
        return EMPTY_STRING;

    // Preformatted sections: only the closing pre tag is honoured while inside one.
    if ("pre".equals(html)) { //$NON-NLS-1$
        startPreformattedText();
        return EMPTY_STRING;
    }
    if ("/pre".equals(html)) { //$NON-NLS-1$
        stopPreformattedText();
        return EMPTY_STRING;
    }
    if (fIsPreformattedText)
        return EMPTY_STRING;

    // Opening tags rendered in bold: b, hN and dt.
    bool opensHeading= html.length() > 1 && html.charAt(0) is 'h' && Character.isDigit(html.charAt(1));
    if ("b".equals(html) || opensHeading || "dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
        startBold();
        return EMPTY_STRING;
    }

    if ("dd".equals(html)) //$NON-NLS-1$
        return "\t"; //$NON-NLS-1$

    if ("li".equals(html)) //$NON-NLS-1$
        // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
        return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$

    if ("/b".equals(html)) { //$NON-NLS-1$
        stopBold();
        return EMPTY_STRING;
    }

    if ("p".equals(html)) { //$NON-NLS-1$
        fInParagraph= true;
        return LINE_DELIM;
    }

    // Tags that simply become a line break.
    if ("dl".equals(html) || "br".equals(html) || "br/".equals(html) || "div".equals(html) || "/dd".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
        return LINE_DELIM;

    if ("/p".equals(html)) { //$NON-NLS-1$
        // A closing p tag adds a line break only when no paragraph was open.
        bool wasInParagraph= fInParagraph;
        fInParagraph= false;
        if (wasInParagraph)
            return EMPTY_STRING;
        return LINE_DELIM;
    }

    // Closing tags of the bold block elements end the bold region and the line.
    bool closesHeading= html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2)); //$NON-NLS-1$
    if (closesHeading || "/dt".equals(html)) { //$NON-NLS-1$
        stopBold();
        return LINE_DELIM;
    }

    // Only the first head element switches content suppression on.
    if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
        fHeaderDetected= true;
        fIgnore= true;
        return EMPTY_STRING;
    }

    if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
        fIgnore= false;
        return EMPTY_STRING;
    }

    // Every other known tag (for example ul) produces no text.
    return EMPTY_STRING;
}
232 | |
/*
 * A '<' has been read. Collects the lower-cased tag content up to the closing
 * '>' and hands it to html2Text.
 *
 * Special cases:
 * - Text between double quotes is copied without looking for '<' or '>'.
 * - A '<' met outside a comment means the first '<' did not start a tag: the
 *   new '<' is pushed back and the text collected so far is returned
 *   literally, prefixed with '<'.
 * - Inside a comment ("!--" prefix) a '>' ends the tag only when the
 *   collected text ends with "--"; otherwise the '>' is part of the comment.
 * - Returns null when the input ends before the tag is closed.
 */
private String processHTMLTag() throws IOException {

    StringBuffer tag= new StringBuffer();

    for (;;) {

        int c= nextChar();

        while (c !is -1 && c !is '>') {
            tag.append(Character.toLowerCase((char) c));
            c= nextChar();

            if (c is '"') {
                // Copy the opening quote and the quoted text; stop at the closing quote or end of input.
                do {
                    tag.append(Character.toLowerCase((char) c));
                    c= nextChar();
                } while (c !is -1 && c !is '"');
            }

            if (c is '<' && !isInComment(tag)) {
                unread(c);
                return '<' + tag.toString();
            }
        }

        if (c is -1)
            return null;

        // Here c is '>': it ends the tag unless it sits inside an unfinished comment.
        if (!isInComment(tag) || isCommentEnd(tag))
            return html2Text(tag.toString());

        // unfinished comment
        tag.append((char) c);
    }
}
273 | |
/*
 * Tells whether the collected tag content starts with the comment marker "!--".
 */
private static bool isInComment(StringBuffer buf) {
    if (buf.length() < 3)
        return false;
    return buf.charAt(0) is '!' && buf.charAt(1) is '-' && buf.charAt(2) is '-';
}
277 | |
/*
 * Tells whether the collected tag content ends with "--" and is at least five
 * characters long, i.e. long enough to hold both the opening "!--" and a
 * separate closing "--".
 */
private static bool isCommentEnd(StringBuffer buf) {
    int length= buf.length();
    if (length < 5)
        return false;
    return buf.charAt(length - 2) is '-' && buf.charAt(length - 1) is '-';
}
282 | |
/*
 * Handles a character read inside preformatted text. Carriage return and
 * line feed each add one to the character count; no character is ever
 * substituted, so the result is always null.
 */
private String processPreformattedText(int c) {
    bool isLineBreak= (c is '\r' || c is '\n');
    if (isLineBreak)
        fCounter++;
    return null;
}
288 | |
289 | |
/*
 * Pushes a character back so that it is read again.
 * Assumes getReader() returns the PushbackReader created in the constructor - confirm
 * against SubstitutionTextReader.
 */
private void unread(int ch) throws IOException {
    PushbackReader pushback= (PushbackReader) getReader();
    pushback.unread(ch);
}
293 | |
/**
 * Translates the name of a character reference into the text it stands for.
 * <p>
 * Numeric references start with '#': <code>#NNN</code> is decimal, while
 * <code>#xHHH</code> and <code>#XHHH</code> are hexadecimal. Values above
 * 0xFFFF and up to 0x10FFFF are returned as a surrogate pair instead of being
 * truncated to an unrelated character. Any other name is looked up in the
 * table of known entities.
 * </p>
 *
 * @param symbol the reference without the leading '&amp;' and the trailing ';'
 * @return the replacement text, or <code>"&amp;" + symbol</code> if the reference
 *         is unknown, malformed or outside the Unicode range
 */
protected String entity2Text(String symbol) {
    if (symbol.length() > 1 && symbol.charAt(0) is '#') {
        try {
            char radixMarker= symbol.charAt(1);
            int codePoint;
            if (radixMarker is 'x' || radixMarker is 'X') {
                codePoint= Integer.parseInt(symbol.substring(2), 16);
            } else {
                codePoint= Integer.parseInt(symbol.substring(1), 10);
            }

            if (codePoint >= 0 && codePoint <= 0xFFFF)
                return EMPTY_STRING + (char) codePoint;

            if (codePoint > 0xFFFF && codePoint <= 0x10FFFF) {
                // Supplementary character: encode as a high/low surrogate pair.
                int offset= codePoint - 0x10000;
                char high= (char) (0xD800 + (offset >> 10));
                char low= (char) (0xDC00 + (offset & 0x3FF));
                return EMPTY_STRING + high + low;
            }
            // Negative or beyond the Unicode range: treated as not found below.
        } catch (NumberFormatException ignored) {
            // Not a number: treated as not found below.
        }
    } else {
        String str= (String) fgEntityLookup.get(symbol);
        if (str !is null) {
            return str;
        }
    }
    return "&" + symbol; // not found //$NON-NLS-1$
}
314 | |
/*
 * A '&' has been read. Processes an entity.
 *
 * Collects the following letters, digits and '#' characters. If they are
 * terminated by ';' the collected name is translated by entity2Text.
 * Otherwise the text is returned as read: '&', the collected characters and
 * the terminating character (unless the input ended).
 */
private String processEntity() throws IOException {
    StringBuffer name= new StringBuffer();

    int c;
    for (c= nextChar(); c is '#' || Character.isLetterOrDigit((char) c); c= nextChar())
        name.append((char) c);

    if (c is ';')
        return entity2Text(name.toString());

    // Not a complete entity: give the raw text back.
    name.insert(0, '&');
    if (c !is -1)
        name.append((char) c);
    return name.toString();
}
334 } |