1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
|
23 | |
|
24 | |
|
25 | |
|
26 | |
|
27 | |
|
28 | |
|
29 | |
|
30 | |
|
31 | |
|
32 | |
|
33 | |
|
34 | |
package net.sf.jmatchparser.util.charset.jchardet; |
35 | |
|
36 | |
import java.io.ByteArrayOutputStream; |
37 | |
import java.nio.ByteBuffer; |
38 | |
import java.nio.CharBuffer; |
39 | |
import java.nio.charset.Charset; |
40 | |
import java.nio.charset.CharsetDecoder; |
41 | |
import java.nio.charset.CharsetEncoder; |
42 | |
import java.nio.charset.CoderResult; |
43 | |
import java.nio.charset.UnsupportedCharsetException; |
44 | |
|
45 | |
import org.mozilla.intl.chardet.nsDetector; |
46 | |
import org.mozilla.intl.chardet.nsICharsetDetectionObserver; |
47 | |
|
48 | 61 | class JChardetCharset extends Charset { |
49 | |
|
50 | 1 | public static final String[] NAMES = new String[] { |
51 | |
"jChardet", |
52 | |
"jChardet-JAPANESE", |
53 | |
"jChardet-CHINESE", |
54 | |
"jChardet-SIMPLIFIED-CHINESE", |
55 | |
"jChardet-TRADITIONAL-CHINESE", |
56 | |
"jChardet-KOREAN", |
57 | |
}; |
58 | |
|
59 | 1 | private static final JChardetCharset[] instances = new JChardetCharset[NAMES.length]; |
60 | |
|
61 | |
protected static JChardetCharset getInstance(int languageFlag) { |
62 | 28 | if (instances[languageFlag] == null) { |
63 | 1 | instances[languageFlag] = new JChardetCharset(languageFlag); |
64 | |
} |
65 | 28 | return instances[languageFlag]; |
66 | |
} |
67 | |
|
68 | |
private final int languageFlag; |
69 | |
|
70 | |
private JChardetCharset(int languageFlag) { |
71 | 1 | super(NAMES[languageFlag], null); |
72 | 1 | this.languageFlag = languageFlag; |
73 | 1 | } |
74 | |
|
75 | |
@Override |
76 | |
public boolean contains(Charset cs) { |
77 | 0 | return cs == this; |
78 | |
} |
79 | |
|
80 | |
@Override |
81 | |
public CharsetDecoder newDecoder() { |
82 | 31 | return new Decoder(); |
83 | |
} |
84 | |
|
85 | |
@Override |
86 | |
public CharsetEncoder newEncoder() { |
87 | 0 | throw new UnsupportedOperationException(); |
88 | |
} |
89 | |
|
90 | |
@Override |
91 | |
public boolean canEncode() { |
92 | 0 | return false; |
93 | |
} |
94 | |
|
95 | |
private class Decoder extends CharsetDecoder implements nsICharsetDetectionObserver { |
96 | |
private nsDetector detector; |
97 | 31 | private ByteArrayOutputStream buffer = new ByteArrayOutputStream(); |
98 | 31 | private boolean isASCII = true; |
99 | |
|
100 | 31 | private CharsetDecoder usedDecoder = null; |
101 | 31 | private ByteBuffer remaining = null; |
102 | |
|
103 | 31 | protected Decoder() { |
104 | 31 | super(JChardetCharset.this, 1.0f, 2.0f); |
105 | 31 | detector = new nsDetector(languageFlag); |
106 | 31 | detector.Init(this); |
107 | 31 | } |
108 | |
|
109 | |
@Override |
110 | |
public void Notify(String charset) { |
111 | 0 | setCharset(charset); |
112 | 0 | } |
113 | |
|
114 | |
private void setCharset(String charset) { |
115 | |
try { |
116 | 30 | usedDecoder = Charset.forName(charset).newDecoder(); |
117 | 1 | } catch (UnsupportedCharsetException ex) { |
118 | 1 | usedDecoder = Charset.forName("ISO-8859-1").newDecoder(); |
119 | 29 | } |
120 | 30 | usedDecoder.onUnmappableCharacter(unmappableCharacterAction()); |
121 | 30 | usedDecoder.onMalformedInput(malformedInputAction()); |
122 | 30 | byte[] buf = buffer.toByteArray(); |
123 | 30 | if (buf.length > 0) { |
124 | 30 | remaining = ByteBuffer.allocate(buf.length); |
125 | 30 | remaining.put(buf); |
126 | 30 | remaining.flip(); |
127 | |
} |
128 | 30 | buffer = null; |
129 | 30 | detector = null; |
130 | 30 | isASCII = false; |
131 | 30 | } |
132 | |
|
133 | |
@Override |
134 | |
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) { |
135 | 30 | byte[] buf = new byte[1024]; |
136 | |
while (true) { |
137 | 1056 | if (usedDecoder != null) { |
138 | 0 | if (remaining != null) { |
139 | 0 | CoderResult result = writeOut(out); |
140 | 0 | if (!result.isUnderflow()) |
141 | 0 | return result; |
142 | |
} |
143 | 0 | return usedDecoder.decode(in, out, false); |
144 | |
} |
145 | 1056 | if (in.remaining() == 0) |
146 | 30 | return CoderResult.UNDERFLOW; |
147 | 1026 | int len = Math.min(in.remaining(), buf.length); |
148 | 1026 | in.get(buf, 0, len); |
149 | 1026 | buffer.write(buf, 0, len); |
150 | 1026 | if (isASCII) |
151 | 324 | isASCII = detector.isAscii(buf, len); |
152 | |
|
153 | 1026 | if (!isASCII) { |
154 | 727 | boolean done = detector.DoIt(buf, len, false); |
155 | 727 | done = false; |
156 | 727 | if (done && usedDecoder == null) { |
157 | |
|
158 | 0 | setCharset("ISO-8859-1"); |
159 | |
} |
160 | |
} |
161 | 1026 | } |
162 | |
} |
163 | |
|
164 | |
@Override |
165 | |
protected CoderResult implFlush(CharBuffer out) { |
166 | 30 | if (usedDecoder == null) { |
167 | 30 | detector.DataEnd(); |
168 | |
} |
169 | 30 | if (usedDecoder == null) { |
170 | 30 | if (isASCII) { |
171 | 5 | setCharset("US-ASCII"); |
172 | |
} else { |
173 | 25 | String prob[] = detector.getProbableCharsets(); |
174 | 25 | if (prob.length > 0) { |
175 | 25 | setCharset(prob[0]); |
176 | |
} else { |
177 | |
|
178 | 0 | setCharset("ISO-8859-1"); |
179 | |
} |
180 | |
} |
181 | |
} |
182 | 30 | if (remaining != null) { |
183 | 30 | CoderResult result = writeOut(out); |
184 | 30 | if (!result.isUnderflow()) |
185 | 0 | return result; |
186 | |
} |
187 | 30 | if (!isASCII) { |
188 | 30 | ByteBuffer empty = ByteBuffer.allocate(1); |
189 | 30 | empty.flip(); |
190 | 30 | CoderResult result = usedDecoder.decode(empty, out, true); |
191 | 30 | if (!result.isUnderflow()) |
192 | 0 | return result; |
193 | 30 | result = usedDecoder.flush(out); |
194 | 30 | if (!result.isUnderflow()) |
195 | 0 | return result; |
196 | 30 | isASCII = true; |
197 | |
} |
198 | 30 | return super.implFlush(out); |
199 | |
} |
200 | |
|
201 | |
private CoderResult writeOut(CharBuffer out) { |
202 | 30 | CoderResult result = usedDecoder.decode(remaining, out, false); |
203 | 30 | if (remaining.remaining() == 0) |
204 | 30 | remaining = null; |
205 | 30 | return result; |
206 | |
} |
207 | |
|
208 | |
@Override |
209 | |
protected void implReset() { |
210 | 30 | detector = new nsDetector(languageFlag); |
211 | 30 | buffer = new ByteArrayOutputStream(); |
212 | 30 | usedDecoder = null; |
213 | 30 | remaining = null; |
214 | 30 | isASCII = true; |
215 | 30 | } |
216 | |
|
217 | |
@Override |
218 | |
public boolean isAutoDetecting() { |
219 | 0 | return true; |
220 | |
} |
221 | |
|
222 | |
@Override |
223 | |
public boolean isCharsetDetected() { |
224 | 0 | return usedDecoder != null; |
225 | |
} |
226 | |
|
227 | |
@Override |
228 | |
public Charset detectedCharset() { |
229 | 0 | return usedDecoder.charset(); |
230 | |
} |
231 | |
} |
232 | |
} |