Mercurial > projects > dwt2
comparison org.eclipse.jface.text/src/org/eclipse/jface/internal/text/html/HTML2TextReader.d @ 12:bc29606a740c
Added dwt-addons in original directory structure of eclipse.org
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sat, 14 Mar 2009 18:23:29 +0100 |
parents | |
children | dbfb303e8fb0 |
comparison
equal
deleted
inserted
replaced
11:43904fec5dca | 12:bc29606a740c |
---|---|
1 /******************************************************************************* | |
2 * Copyright (c) 2000, 2008 IBM Corporation and others. | |
3 * All rights reserved. This program and the accompanying materials | |
4 * are made available under the terms of the Eclipse Public License v1.0 | |
5 * which accompanies this distribution, and is available at | |
6 * http://www.eclipse.org/legal/epl-v10.html | |
7 * | |
8 * Contributors: | |
9 * IBM Corporation - initial API and implementation | |
10 * Port to the D programming language: | |
11 * Frank Benoit <benoit@tionex.de> | |
12 *******************************************************************************/ | |
13 module org.eclipse.jface.internal.text.html.HTML2TextReader; | |
14 | |
15 import org.eclipse.jface.internal.text.html.HTMLPrinter; // packageimport | |
16 import org.eclipse.jface.internal.text.html.BrowserInformationControl; // packageimport | |
17 import org.eclipse.jface.internal.text.html.SubstitutionTextReader; // packageimport | |
18 import org.eclipse.jface.internal.text.html.HTMLTextPresenter; // packageimport | |
19 import org.eclipse.jface.internal.text.html.BrowserInput; // packageimport | |
20 import org.eclipse.jface.internal.text.html.SingleCharReader; // packageimport | |
21 import org.eclipse.jface.internal.text.html.BrowserInformationControlInput; // packageimport | |
22 import org.eclipse.jface.internal.text.html.HTMLMessages; // packageimport | |
23 | |
24 import java.lang.all; | |
25 import java.util.Map; | |
26 import java.util.HashMap; | |
27 import java.util.Set; | |
28 import java.util.HashSet; | |
29 import org.eclipse.dwtxhelper.PushbackReader; | |
30 static import tango.text.convert.Utf; | |
31 | |
32 import org.eclipse.swt.SWT; | |
33 import org.eclipse.swt.custom.StyleRange; | |
34 import org.eclipse.jface.text.TextPresentation; | |
35 | |
36 | |
37 /** | |
38 * Reads the text contents from a reader of HTML contents and translates | |
39 * the tags or cuts them out. | |
40 * <p> | |
41 * Moved into this package from <code>org.eclipse.jface.internal.text.revisions</code>.</p> | |
42 */ | |
43 public class HTML2TextReader : SubstitutionTextReader { | |
44 | |
45 private static const String EMPTY_STRING= ""; //$NON-NLS-1$ | |
46 private static Map fgEntityLookup_; | |
47 private static Set fgTags_; | |
/**
 * Returns the shared table that maps an entity name (the text between
 * '&' and ';') to its replacement text. The table is created on first use
 * while holding the monitor of HTML2TextReader.classinfo.
 */
private static Map fgEntityLookup(){
    if( fgEntityLookup_ is null ){
        synchronized(HTML2TextReader.classinfo ){
            if( fgEntityLookup_ is null ){
                // Fill a local map and assign the static field last: the
                // outer null check runs without the lock, so publishing the
                // map before it is filled could hand out an incomplete table.
                Map lookup= new HashMap(7);
                lookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
                lookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
                lookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
                lookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
                lookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
                lookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
                lookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
                fgEntityLookup_= lookup;
            }
        }
    }
    return fgEntityLookup_;
}
/**
 * Returns the shared set of lower-case tag names that html2Text reacts to;
 * every other tag is dropped. The set is created on first use while holding
 * the monitor of HTML2TextReader.classinfo.
 */
private static Set fgTags(){
    if( fgTags_ is null ){
        synchronized(HTML2TextReader.classinfo ){
            if( fgTags_ is null ){
                // Fill a local set and assign the static field last: the
                // outer null check runs without the lock, so publishing the
                // set before it is filled could hand out an incomplete set.
                Set tags= new HashSet();
                tags.add("b"); //$NON-NLS-1$
                tags.add("br"); //$NON-NLS-1$
                tags.add("br/"); //$NON-NLS-1$
                tags.add("div"); //$NON-NLS-1$
                // NOTE(review): "h6" is not listed although html2Text accepts
                // any 'h' + digit name - confirm whether that is intended.
                tags.add("h1"); //$NON-NLS-1$
                tags.add("h2"); //$NON-NLS-1$
                tags.add("h3"); //$NON-NLS-1$
                tags.add("h4"); //$NON-NLS-1$
                tags.add("h5"); //$NON-NLS-1$
                tags.add("p"); //$NON-NLS-1$
                tags.add("dl"); //$NON-NLS-1$
                tags.add("dt"); //$NON-NLS-1$
                tags.add("dd"); //$NON-NLS-1$
                tags.add("li"); //$NON-NLS-1$
                tags.add("ul"); //$NON-NLS-1$
                tags.add("pre"); //$NON-NLS-1$
                tags.add("head"); //$NON-NLS-1$
                fgTags_= tags;
            }
        }
    }
    return fgTags_;
}
92 | |
93 private int fCounter= 0; | |
94 private TextPresentation fTextPresentation; | |
95 private int fBold= 0; | |
96 private int fStartOffset= -1; | |
97 private bool fInParagraph= false; | |
98 private bool fIsPreformattedText= false; | |
99 private bool fIgnore= false; | |
100 private bool fHeaderDetected= false; | |
101 | |
/**
 * Creates a reader that converts the HTML delivered by <code>reader</code>
 * into formatted text. The given reader is wrapped in a PushbackReader
 * before it is handed to the superclass.
 *
 * @param reader the source of the HTML text
 * @param presentation receives the style ranges found while reading;
 *        may be <code>null</code>, in which case no ranges are recorded
 */
public this(Reader reader, TextPresentation presentation) {
    super(new PushbackReader(reader));
    this.fTextPresentation= presentation;
}
113 | |
/**
 * Reads the next character through the superclass and counts it.
 *
 * @return the value returned by <code>super.read()</code>; -1 is passed
 *         through without touching the counter
 */
public int read() {
    int ch= super.read();
    if (ch is -1)
        return ch;
    fCounter++;
    return ch;
}
120 | |
/**
 * Opens one level of bold text. The current counter is remembered as the
 * start offset only when no bold level was open before this call.
 */
protected void startBold() {
    bool outermost= (fBold is 0);
    if (outermost)
        fStartOffset= fCounter;
    fBold++;
}
126 | |
/**
 * Enters preformatted mode: whitespace skipping is switched off and the
 * preformatted flag is raised.
 */
protected void startPreformattedText() {
    setSkipWhitespace(false);
    fIsPreformattedText= true;
}
131 | |
/**
 * Leaves preformatted mode: whitespace skipping is switched back on and the
 * preformatted flag is cleared.
 */
protected void stopPreformattedText() {
    setSkipWhitespace(true);
    fIsPreformattedText= false;
}
136 | |
/**
 * Closes one level of bold text. When the outermost level is closed and a
 * presentation was supplied, a bold style range running from the remembered
 * start offset to the current counter is added to it.
 * <p>
 * A call without a matching <code>startBold()</code> is ignored.</p>
 */
protected void stopBold() {
    // An unmatched closing tag (e.g. a stray "</b>") used to push fBold
    // below zero; startBold() then never saw fBold is 0 again and every
    // later bold range was lost.
    if (fBold <= 0)
        return;

    -- fBold;
    if (fBold is 0) {
        if (fTextPresentation !is null) {
            fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD));
        }
        fStartOffset= -1;
    }
}
146 | |
/*
 * Chooses the handler for the character just read:
 * '<' always starts tag processing (even while content is being ignored),
 * ignored content is replaced by the empty string, '&' starts entity
 * processing, and any other character goes to the preformatted-text
 * handler while preformatted mode is active. Returns null when none of
 * these cases applies.
 */
protected String computeSubstitution(int c) {
    if (c is '<')
        return processHTMLTag();

    if (fIgnore)
        return EMPTY_STRING;

    if (c is '&')
        return processEntity();

    if (fIsPreformattedText)
        return processPreformattedText(c);

    return null;
}
163 | |
/*
 * Translates the content of one tag (the text between '<' and '>') into
 * its plain-text replacement and updates the bold / paragraph /
 * preformatted / header state. Tags whose name is not in fgTags, and all
 * tags other than pre and /pre while in preformatted mode, yield the
 * empty string.
 */
private String html2Text(String html) {

    if (html is null || html.length() is 0)
        return EMPTY_STRING;

    html= html.toLowerCase();

    // Split into "is it a closing tag" and the bare element name.
    bool closing= ('/' is html.charAt(0));
    String name= closing ? html.substring(1) : html;

    if (!fgTags.contains(name))
        return EMPTY_STRING;

    if ("pre".equals(name)) { //$NON-NLS-1$
        if (closing)
            stopPreformattedText();
        else
            startPreformattedText();
        return EMPTY_STRING;
    }

    if (fIsPreformattedText)
        return EMPTY_STRING;

    // 'h' followed by a digit: h1, h2, ...
    bool heading= name.length() > 1 && name.charAt(0) is 'h' && Character.isDigit(name.charAt(1));

    if (closing) {
        if ("b".equals(name)) { //$NON-NLS-1$
            stopBold();
            return EMPTY_STRING;
        }

        if ("p".equals(name)) { //$NON-NLS-1$
            // a </p> that follows a <p> adds nothing; a lone </p> breaks the line
            String result= fInParagraph ? EMPTY_STRING : LINE_DELIM;
            fInParagraph= false;
            return result;
        }

        if (heading || "dt".equals(name)) { //$NON-NLS-1$
            stopBold();
            return LINE_DELIM;
        }

        if ("dd".equals(name)) //$NON-NLS-1$
            return LINE_DELIM;

        if ("head".equals(name) && fHeaderDetected && fIgnore) //$NON-NLS-1$
            fIgnore= false;

        return EMPTY_STRING;
    }

    if ("b".equals(name) || heading || "dt".equals(name)) { //$NON-NLS-1$ //$NON-NLS-2$
        startBold();
        return EMPTY_STRING;
    }

    if ("p".equals(name)) { //$NON-NLS-1$
        fInParagraph= true;
        return LINE_DELIM;
    }

    if ("dl".equals(name) || "br".equals(name) || "br/".equals(name) || "div".equals(name)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
        return LINE_DELIM;

    if ("dd".equals(name)) //$NON-NLS-1$
        return "\t"; //$NON-NLS-1$

    if ("li".equals(name)) //$NON-NLS-1$
        // FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
        return LINE_DELIM ~ HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$

    if ("head".equals(name) && !fHeaderDetected) { //$NON-NLS-1$
        // only the first <head> switches content ignoring on
        fHeaderDetected= true;
        fIgnore= true;
    }

    return EMPTY_STRING;
}
252 | |
/*
 * A '<' has been read. Collects the lower-cased tag text up to the closing
 * '>' and returns its translation via html2Text.
 * - Text inside double quotes is collected without looking for '<'.
 * - A further '<' outside a comment ends the attempt: the '<' is pushed
 *   back and the collected text is returned literally, prefixed with '<'.
 * - Inside a comment ("!--" prefix) a '>' only ends the tag when the
 *   collected text ends with "--".
 * - Returns null when the input ends before the tag is closed.
 */
private String processHTMLTag() {

    StringBuffer tag= new StringBuffer();
    int c;
    while (true) {

        c= nextDChar();

        while (c !is -1 && c !is '>') {
            tag.append(dcharToString(Character.toLowerCase(cast(dchar) c)));
            c= nextDChar();
            if (c is '"'){
                // quoted section: swallow everything up to the next quote
                tag.append(dcharToString(Character.toLowerCase(cast(dchar) c)));
                c= nextDChar();
                while (c !is -1 && c !is '"'){
                    tag.append(dcharToString(Character.toLowerCase(cast(dchar) c)));
                    c= nextDChar();
                }
            }
            if (c is '<' && !isInComment(tag)) {
                // not a tag after all: hand back the text and re-read the '<'
                unreadDChar(c);
                return '<' ~ tag.toString();
            }
        }

        if (c is -1)
            return null;

        bool unfinishedComment= isInComment(tag) && !isCommentEnd(tag);
        if (!unfinishedComment)
            break;

        // the '>' belongs to the comment body; keep collecting
        tag.append(dcharToString(cast(dchar) c));
    }

    return html2Text(tag.toString());
}
293 | |
/*
 * Tells whether the collected tag text starts with the comment opener "!--".
 */
private static bool isInComment(StringBuffer buf) {
    if (buf.length() < 3)
        return false;
    String opener= buf.slice().substring(0, 3);
    return "!--".equals(opener); //$NON-NLS-1$
}
297 | |
/*
 * Tells whether the collected tag text is at least five characters long
 * (room for "!--" plus "--") and ends with "--".
 */
private static bool isCommentEnd(StringBuffer buf) {
    int len= buf.length();
    if (len < 5)
        return false;
    String tail= buf.slice().substring(len - 2);
    return "--".equals(tail); //$NON-NLS-1$
}
302 | |
/*
 * Handles a character read in preformatted mode: a carriage return or line
 * feed advances the counter by one. Always returns null.
 */
private String processPreformattedText(int c) {
    bool lineBreak= (c is '\n' || c is '\r');
    if (lineBreak)
        ++ fCounter;
    return null;
}
308 | |
309 | |
/*
 * Pushes one code point back onto the underlying PushbackReader as its
 * UTF-8 code units.
 *
 * The units are pushed back last-to-first. This assumes unread() makes the
 * pushed unit the next one to be read (the java.io.PushbackReader
 * contract); pushing them first-to-last would then return a multi-unit
 * character in reversed order. For a single-unit character such as '<' the
 * order makes no difference.
 * NOTE(review): confirm that the ported PushbackReader follows that
 * contract and can hold up to four pushed-back units.
 */
private void unreadDChar(dchar ch) {
    char[4] buf;
    dchar[1] ibuf;
    ibuf[0] = ch;
    foreach_reverse( char c; tango.text.convert.Utf.toString( ibuf[], buf[] )){
        (cast(PushbackReader) getReader()).unread(c);
    }
}
318 | |
/**
 * Translates an entity name (the text between '&' and ';') into its
 * replacement text.
 * <p>
 * Numeric references are accepted in decimal (<code>#65</code>) and in
 * hexadecimal with either marker case (<code>#x41</code>,
 * <code>#X41</code>). Named entities are looked up in fgEntityLookup.</p>
 *
 * @param symbol the entity name without '&' and ';'
 * @return the replacement text, or <code>"&" ~ symbol</code> when the
 *         entity is unknown, is not a parsable number, or does not denote
 *         a valid Unicode scalar value
 */
protected String entity2Text(String symbol) {
    if (symbol.length() > 1 && symbol.charAt(0) is '#') {
        try {
            int ch;
            if (symbol.charAt(1) is 'x' || symbol.charAt(1) is 'X') {
                ch= Integer.parseInt(symbol.substring(2), 16);
            } else {
                ch= Integer.parseInt(symbol.substring(1), 10);
            }
            // Only convert values that are real Unicode scalar values;
            // anything else cannot be encoded and is treated as unknown.
            bool surrogate= ch >= 0xD800 && ch <= 0xDFFF;
            if (ch >= 0 && ch <= 0x10FFFF && !surrogate)
                return dcharToString( cast(dchar)ch);
        } catch (NumberFormatException e) {
            // Deliberately ignored: a malformed number is reported like an
            // unknown entity by the fall-through return below.
        }
    } else {
        String str= stringcast( fgEntityLookup.get(symbol));
        if (str !is null) {
            return str;
        }
    }
    return "&" ~ symbol; // not found //$NON-NLS-1$
}
339 | |
/*
 * A '&' has been read. Collects the following letters, digits and '#'
 * characters as the entity name. When the name is terminated by ';' the
 * translated entity is returned; otherwise the text is returned as read:
 * '&', the collected name and the terminating character (unless the input
 * ended).
 */
private String processEntity() {
    StringBuffer name= new StringBuffer();
    int c;
    for (c= nextDChar(); Character.isLetterOrDigit(cast(dchar)c) || c is '#'; c= nextDChar()) {
        name.append(dcharToString(cast(dchar) c));
    }

    if (c is ';')
        return entity2Text(name.toString());

    // not an entity: rebuild the literal text
    name.select(0, 0);
    name.prepend("&");
    if (c !is -1)
        name.append(dcharToString(cast(dchar) c));
    return name.toString();
}
360 } |