1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
|
23 | |
|
24 | |
|
25 | |
|
26 | |
|
27 | |
|
28 | |
|
29 | |
|
30 | |
|
31 | |
|
32 | |
|
33 | |
package net.sf.jmatchparser.util.charset; |
34 | |
|
35 | |
import java.nio.ByteBuffer; |
36 | |
import java.nio.CharBuffer; |
37 | |
import java.nio.charset.Charset; |
38 | |
import java.nio.charset.CharsetDecoder; |
39 | |
import java.nio.charset.CharsetEncoder; |
40 | |
import java.nio.charset.CoderResult; |
41 | |
import java.nio.charset.CodingErrorAction; |
42 | |
|
/**
 * UTF-8 based charset that keeps bytes which are not valid UTF-8 instead of
 * rejecting them, so that the decoded text can be encoded back to the
 * original bytes.
 *
 * <p>Every byte that cannot be decoded as UTF-8 is decoded to the single
 * char {@code base + (byte & 0xFF)}. Two variants exist, selected by the
 * constructor argument:
 * <ul>
 * <li>{@link #SURROGATE_VARIANT}: {@code base} is
 * {@link #SURROGATE_BASE_CODEPOINT}, i.e. undecodable bytes become chars in
 * the low surrogate range.</li>
 * <li>{@link #PUA_VARIANT}: {@code base} is {@link #PUA_BASE_CODEPOINT}.
 * Chars in the range {@link #PUA_ESCAPE_CHARACTER}..{@link #MAX_PUA_CODEPOINT}
 * that really occur in the UTF-8 input are decoded with a preceding
 * {@link #PUA_ESCAPE_CHARACTER} so they can be told apart from undecodable
 * bytes.</li>
 * </ul>
 */
class UTF8BinaryCharset extends Charset {

    static final String SURROGATE_VARIANT = "UTF-8-Binary";
    static final String PUA_VARIANT = "UTF-8-Binary-PUA";

    static final int PUA_BASE_CODEPOINT = 0xE900;
    static final int SURROGATE_BASE_CODEPOINT = 0xDC00;
    static final char PUA_ESCAPE_CHARACTER = (char) (PUA_BASE_CODEPOINT + 0x7F);
    static final int MAX_PUA_CODEPOINT = PUA_BASE_CODEPOINT + 0xFF;

    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** {@code true} for the PUA variant, {@code false} for the surrogate variant. */
    private final boolean usePUA;

    /**
     * Creates the charset.
     *
     * @param usePUA {@code true} to create the {@link #PUA_VARIANT},
     *               {@code false} to create the {@link #SURROGATE_VARIANT}
     */
    protected UTF8BinaryCharset(boolean usePUA) {
        super(usePUA ? PUA_VARIANT : SURROGATE_VARIANT, null);
        this.usePUA = usePUA;
    }

    /**
     * Reports another {@code UTF8BinaryCharset}, or anything UTF-8 itself
     * contains, as contained in this charset.
     */
    @Override
    public boolean contains(Charset cs) {
        return cs instanceof UTF8BinaryCharset || UTF_8.contains(cs);
    }

    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder();
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder();
    }

    /**
     * Tells whether the value is a code point outside the surrogate range.
     *
     * @param codePoint the value to test
     * @return {@code true} if the value lies between
     *         {@link Character#MIN_CODE_POINT} and
     *         {@link Character#MAX_CODE_POINT} and is not a surrogate
     */
    protected static boolean isValidCodepoint(int codePoint) {
        return (codePoint >= Character.MIN_CODE_POINT && codePoint < Character.MIN_HIGH_SURROGATE)
                || (codePoint > Character.MAX_LOW_SURROGATE && codePoint <= Character.MAX_CODE_POINT);
    }

    /**
     * Tells whether the PUA variant has to escape the given code point, i.e.
     * whether it lies in {@link #PUA_ESCAPE_CHARACTER}..{@link #MAX_PUA_CODEPOINT}.
     *
     * @param codePoint the value to test
     * @return {@code true} if the code point needs an escape character
     */
    protected static boolean needEscaping(int codePoint) {
        return codePoint >= PUA_ESCAPE_CHARACTER && codePoint <= MAX_PUA_CODEPOINT;
    }

    /**
     * Decoder that collects the bytes of one UTF-8 sequence in an internal
     * buffer, hands complete sequences to a strict UTF-8 decoder and turns
     * every byte of a rejected or broken sequence into a replacement char.
     *
     * <p>{@code state} is either one of the {@code STATE_*} constants or the
     * positive number of bytes still missing from the buffered sequence.
     */
    private class Decoder extends CharsetDecoder {

        /** The buffer holds a complete sequence that still has to be decoded. */
        private static final byte STATE_BUFFER_FULL = 0;
        /** The buffer holds bytes that have to be written as replacement chars. */
        private static final byte STATE_BUFFER_FULL_INVALID = -10;
        /** The buffer is empty; the next byte starts a new sequence. */
        private static final byte STATE_BUFFER_EMPTY = -20;

        private final CharsetDecoder utf8Decoder = UTF_8.newDecoder();
        private final ByteBuffer buffer = ByteBuffer.allocate(4);
        private byte state = STATE_BUFFER_EMPTY;

        protected Decoder() {
            // The PUA variant may emit an escape char in front of a decoded char.
            super(UTF8BinaryCharset.this, 1.0f, 1.0f + (usePUA ? 1 : 0));
            utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
            utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        }

        @Override
        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
            while (true) {
                if (state == STATE_BUFFER_FULL_INVALID) {
                    if (!writeReplacements(out)) {
                        return CoderResult.OVERFLOW;
                    }
                }
                if (state == STATE_BUFFER_FULL) {
                    // decodeBuffer only fails when the output buffer is too
                    // small; the sequence stays buffered for the next call.
                    if (!decodeBuffer(out)) {
                        return CoderResult.OVERFLOW;
                    }
                }
                if (in.remaining() == 0) {
                    return CoderResult.UNDERFLOW;
                }
                if (out.remaining() == 0) {
                    return CoderResult.OVERFLOW;
                }
                byte b = in.get();
                if (state == STATE_BUFFER_EMPTY) {
                    calculateState(b);
                } else if ((b >> 6) != -2) {
                    // Not a continuation byte (10xxxxxx): the buffered
                    // sequence is broken and b starts a new one.
                    if (!writeReplacements(out)) {
                        in.position(in.position() - 1);
                        state = STATE_BUFFER_FULL_INVALID;
                        return CoderResult.OVERFLOW;
                    }
                    calculateState(b);
                }
                buffer.put(b);
                state--;
                if (state == -1) {
                    state = STATE_BUFFER_FULL_INVALID;
                }
            }
        }

        /**
         * Decodes the complete sequence in the buffer and writes the result
         * (or, if the sequence is rejected, its replacement chars) to
         * {@code out}.
         *
         * @return {@code false} if {@code out} is too small; nothing has been
         *         written then and the state tells the next call what to retry
         */
        private boolean decodeBuffer(CharBuffer out) {
            state = STATE_BUFFER_EMPTY;
            ByteBuffer dup = buffer.duplicate();
            dup.flip();
            CharBuffer cb = CharBuffer.allocate(2);
            CoderResult cr = utf8Decoder.decode(dup, cb, false);
            cb.flip();
            if (cr == CoderResult.OVERFLOW) {
                throw new IllegalStateException("UTF8 decoder buffer too small");
            } else if (cr.isMalformed()) {
                state = STATE_BUFFER_FULL_INVALID;
            } else if (cr != CoderResult.UNDERFLOW) {
                throw new IllegalStateException("Unexpected coder result: " + cr.toString());
            } else if (dup.remaining() != 0) {
                throw new IllegalStateException("Remaining size is " + dup.remaining());
            } else if (cb.remaining() == 1) {
                char c = cb.get();
                if (usePUA && needEscaping(c)) {
                    if (out.remaining() < 2) {
                        state = STATE_BUFFER_FULL;
                        return false;
                    }
                    out.put(PUA_ESCAPE_CHARACTER);
                    out.put(c);
                    buffer.clear();
                } else if (isValidCodepoint(c)) {
                    // The caller only guarantees room before reading a byte;
                    // replacements written since then may have used it up.
                    if (out.remaining() < 1) {
                        state = STATE_BUFFER_FULL;
                        return false;
                    }
                    out.put(c);
                    buffer.clear();
                } else {
                    state = STATE_BUFFER_FULL_INVALID;
                }
            } else if (cb.remaining() == 2) {
                char c1 = cb.get();
                char c2 = cb.get();
                if (Character.isSurrogatePair(c1, c2) && isValidCodepoint(Character.toCodePoint(c1, c2))) {
                    // A pair needs two free slots, the caller checked for one.
                    if (out.remaining() < 2) {
                        state = STATE_BUFFER_FULL;
                        return false;
                    }
                    out.put(c1);
                    out.put(c2);
                    buffer.clear();
                } else {
                    state = STATE_BUFFER_FULL_INVALID;
                }
            } else {
                throw new IllegalStateException("Buffer size is " + cb.remaining());
            }
            if (state == STATE_BUFFER_FULL_INVALID) {
                if (!writeReplacements(out)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Writes one replacement char per buffered byte and empties the
         * buffer.
         *
         * @return {@code false} (leaving buffer and state untouched) if
         *         {@code out} cannot take all replacement chars
         */
        private boolean writeReplacements(CharBuffer out) {
            if (out.remaining() < buffer.position()) {
                return false;
            }
            int base = usePUA ? PUA_BASE_CODEPOINT : SURROGATE_BASE_CODEPOINT;
            buffer.flip();
            while (buffer.remaining() > 0) {
                out.put((char) (base + (buffer.get() & 0xFF)));
            }
            buffer.clear();
            state = STATE_BUFFER_EMPTY;
            return true;
        }

        /**
         * Sets {@code state} to the length of the sequence introduced by the
         * lead byte {@code b}, or to a value that becomes
         * {@code STATE_BUFFER_FULL_INVALID} after the caller's decrement if
         * {@code b} cannot start a sequence.
         */
        private void calculateState(byte b) {
            if (b >= 0) {
                state = 1; // 0xxxxxxx
            } else if ((b >> 5) == -2) {
                state = 2; // 110xxxxx
            } else if ((b >> 4) == -2) {
                state = 3; // 1110xxxx
            } else if ((b >> 3) == -2) {
                state = 4; // 11110xxx
            } else {
                state = STATE_BUFFER_FULL_INVALID + 1;
            }
        }

        @Override
        protected CoderResult implFlush(CharBuffer out) {
            // A complete sequence may still be pending after an overflow; it
            // must be decoded, not written as replacement chars.
            if (state == STATE_BUFFER_FULL) {
                if (!decodeBuffer(out)) {
                    return CoderResult.OVERFLOW;
                }
            }
            // Whatever is left is an incomplete sequence at the end of input.
            if (!writeReplacements(out)) {
                return CoderResult.OVERFLOW;
            }
            return super.implFlush(out);
        }

        @Override
        protected void implReset() {
            buffer.clear();
            state = STATE_BUFFER_EMPTY;
        }
    }

    /**
     * Encoder that writes replacement chars ({@code base + 0x80} to
     * {@code base + 0xFF}) as the single byte they stand for and delegates
     * everything else to a UTF-8 encoder. In the PUA variant a char preceded
     * by {@link #PUA_ESCAPE_CHARACTER} is always encoded as UTF-8.
     */
    class Encoder extends CharsetEncoder {

        private final CharsetEncoder utf8Encoder = UTF_8.newEncoder();

        protected Encoder() {
            super(UTF8BinaryCharset.this, 1.1f, 4.0f);
        }

        @Override
        protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
            CharBuffer buf = CharBuffer.allocate(2);
            int baseCodepoint = usePUA ? PUA_BASE_CODEPOINT : SURROGATE_BASE_CODEPOINT;
            while (in.remaining() > 0) {
                char c = in.charAt(0);
                // Number of escape chars in front of c. The input position is
                // only advanced once the output has been written, so that a
                // retry after an overflow sees the escape char again.
                int escapeLength = 0;
                if (usePUA && c == PUA_ESCAPE_CHARACTER) {
                    if (in.remaining() < 2) {
                        return CoderResult.UNDERFLOW;
                    }
                    c = in.charAt(1);
                    if (!needEscaping(c)) {
                        return CoderResult.unmappableForLength(1);
                    }
                    escapeLength = 1;
                }
                if (escapeLength == 0 && c >= baseCodepoint + 0x80 && c <= baseCodepoint + 0xFF) {
                    if (out.remaining() == 0) {
                        return CoderResult.OVERFLOW;
                    }
                    in.get();
                    out.put((byte) (c - baseCodepoint));
                    continue;
                }
                buf.clear();
                buf.put(c);
                if (Character.isHighSurrogate(c)) {
                    // Escaped chars are never surrogates, so escapeLength is
                    // 0 here and the second half of the pair is at index 1.
                    if (in.remaining() == 1) {
                        return CoderResult.UNDERFLOW;
                    }
                    char c2 = in.charAt(1);
                    if (Character.isLowSurrogate(c2)) {
                        buf.put(c2);
                    }
                }
                buf.flip();
                CoderResult cr = utf8Encoder.encode(buf, out, false);
                if (cr != CoderResult.UNDERFLOW) {
                    return cr;
                }
                // Skip the escape char and whatever the UTF-8 encoder took.
                in.position(in.position() + escapeLength + buf.position());
                if (buf.remaining() > 0) {
                    return CoderResult.unmappableForLength(buf.remaining());
                }
            }
            return CoderResult.UNDERFLOW;
        }
    }
}