Coverage Report - net.sf.jmatchparser.util.charset.UTF8BinaryCharset
 
Classes in this File Line Coverage Branch Coverage Complexity
UTF8BinaryCharset
90%
9/10
72%
13/18
6,4
UTF8BinaryCharset$Decoder
82%
78/95
77%
53/68
6,4
UTF8BinaryCharset$Encoder
83%
40/48
70%
28/40
6,4
 
 1  
 /*
 2  
  * Copyright (c) 2010 - 2011 Michael Schierl
 3  
  * 
 4  
  * All rights reserved.
 5  
  * 
 6  
  * Redistribution and use in source and binary forms, with or without
 7  
  * modification, are permitted provided that the following conditions
 8  
  * are met:
 9  
  * 
 10  
  * - Redistributions of source code must retain the above copyright notice,
 11  
  *   this list of conditions and the following disclaimer.
 12  
  *   
 13  
  * - Redistributions in binary form must reproduce the above copyright
 14  
  *   notice, this list of conditions and the following disclaimer in the
 15  
  *   documentation and/or other materials provided with the distribution.
 16  
  *   
 17  
  * - Neither name of the copyright holders nor the names of its
 18  
  *   contributors may be used to endorse or promote products derived from
 19  
  *   this software without specific prior written permission.
 20  
  *   
 21  
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 22  
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 23  
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 24  
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 25  
  * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 26  
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 27  
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 28  
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 29  
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 30  
  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 31  
  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 32  
  */
 33  
 package net.sf.jmatchparser.util.charset;
 34  
 
 35  
 import java.nio.ByteBuffer;
 36  
 import java.nio.CharBuffer;
 37  
 import java.nio.charset.Charset;
 38  
 import java.nio.charset.CharsetDecoder;
 39  
 import java.nio.charset.CharsetEncoder;
 40  
 import java.nio.charset.CoderResult;
 41  
 import java.nio.charset.CodingErrorAction;
 42  
 
 43  11129251
 class UTF8BinaryCharset extends Charset {
 44  
 
 45  
         static final String SURROGATE_VARIANT = "UTF-8-Binary";
 46  
         static final String PUA_VARIANT = "UTF-8-Binary-PUA";
 47  
         
 48  
         static final int PUA_BASE_CODEPOINT = 0xE900;
 49  
         static final int SURROGATE_BASE_CODEPOINT = 0xDC00;
 50  
         static final char PUA_ESCAPE_CHARACTER = (char)(PUA_BASE_CODEPOINT + 0x7F);
 51  
         static final int MAX_PUA_CODEPOINT = PUA_BASE_CODEPOINT + 0xFF;
 52  
         
 53  1
         private static final Charset UTF_8 = Charset.forName("UTF-8");
 54  
         
 55  
         private final boolean usePUA;
 56  
         
 57  
         protected UTF8BinaryCharset(boolean usePUA) {
 58  5983
                 super(usePUA ? PUA_VARIANT : SURROGATE_VARIANT, null);
 59  5983
                 this.usePUA = usePUA;
 60  5983
         }
 61  
 
 62  
         @Override
 63  
         public boolean contains(Charset cs) {
 64  0
                 return cs instanceof UTF8BinaryCharset || UTF_8.contains(cs);
 65  
         }
 66  
 
 67  
         @Override
 68  
         public CharsetDecoder newDecoder() {
 69  11966
                 return new Decoder();
 70  
         }
 71  
 
 72  
         @Override
 73  
         public CharsetEncoder newEncoder() {
 74  5983
                 return new Encoder();
 75  
         }
 76  
         
 77  
         protected static boolean isValidCodepoint(int codePoint) {
 78  14795365
                 return (codePoint >= Character.MIN_CODE_POINT && codePoint < Character.MIN_HIGH_SURROGATE)
 79  
                                 || (codePoint > Character.MAX_LOW_SURROGATE && codePoint <= Character.MAX_CODE_POINT);
 80  
         }
 81  
         
 82  
         protected static boolean needEscaping(int codePoint) {
 83  784922
                 return codePoint >= PUA_ESCAPE_CHARACTER && codePoint <= MAX_PUA_CODEPOINT;
 84  
         }
 85  
 
 86  
         private class Decoder extends CharsetDecoder {
 87  
 
 88  
                 // it is deliberate that this is one less than 1!
 89  
                 private static final byte STATE_BUFFER_FULL = 0;
 90  
                 private static final byte STATE_BUFFER_FULL_INVALID = -10;
 91  
                 private static final byte STATE_BUFFER_EMPTY = -20;
 92  
 
 93  11966
                 private final CharsetDecoder utf8Decoder = UTF_8.newDecoder();
 94  11966
                 private final ByteBuffer buffer = ByteBuffer.allocate(4);
 95  11966
                 private byte state = STATE_BUFFER_EMPTY;
 96  
 
 97  11966
                 protected Decoder() {
 98  11966
                         super(UTF8BinaryCharset.this, 1.0f, 1.0f + (usePUA ? 1 : 0));
 99  11966
                         utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
 100  11966
                         utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
 101  11966
                 }
 102  
 
 103  
                 @Override
 104  
                 protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
 105  
                         while (true) {
 106  28298934
                                 if (state == STATE_BUFFER_FULL_INVALID) {
 107  464402
                                         if (!writeReplacements(out))
 108  0
                                                 return CoderResult.OVERFLOW;
 109  
                                 }
 110  28298934
                                 if (state == STATE_BUFFER_FULL) {
 111  7774728
                                         if (!decodeBuffer(out))
 112  0
                                                 return CoderResult.UNDERFLOW;
 113  
                                 }
 114  28298934
                                 if (in.remaining() == 0)
 115  11964
                                         return CoderResult.UNDERFLOW;
 116  28286970
                                 if (out.remaining() == 0)
 117  0
                                         return CoderResult.OVERFLOW;
 118  28286970
                                 byte b = in.get();
 119  28286970
                                 if (state == STATE_BUFFER_EMPTY) {
 120  8239306
                                         calculateState(b);
 121  20047664
                                 } else if ((b >> 6) != -2) {
 122  385046
                                         if (!writeReplacements(out)) {
 123  0
                                                 in.position(in.position() - 1);
 124  0
                                                 state = STATE_BUFFER_FULL_INVALID;
 125  0
                                                 return CoderResult.OVERFLOW;
 126  
                                         }
 127  385046
                                         calculateState(b);
 128  
                                 }
 129  28286970
                                 buffer.put(b);
 130  28286970
                                 state--;
 131  28286970
                                 if (state == -1) {
 132  0
                                         state = STATE_BUFFER_FULL_INVALID;
 133  
                                 }
 134  28286970
                         }
 135  
                 }
 136  
                 
 137  
                 private boolean decodeBuffer(CharBuffer out) {
 138  7774728
                         state = STATE_BUFFER_EMPTY;
 139  7774728
                         ByteBuffer dup = buffer.duplicate();
 140  7774728
                         dup.flip();
 141  7774728
                         CharBuffer cb = CharBuffer.allocate(2);
 142  7774728
                         CoderResult cr = utf8Decoder.decode(dup, cb, false);
 143  7774728
                         cb.flip();
 144  7774728
                         if (cr == CoderResult.UNDERFLOW) {
 145  
                                 // ok
 146  7212
                         } else if (cr == CoderResult.OVERFLOW) {
 147  0
                                 throw new IllegalStateException("UTF8 decoder buffer too small");
 148  7212
                         } else if (cr.isMalformed()) {
 149  7212
                                 cb = null;
 150  7212
                                 state = STATE_BUFFER_FULL_INVALID;
 151  
                         } else {
 152  0
                                 if(true)throw new IllegalStateException("Unexpected coder result: " + cr.toString());
 153  
                         }
 154  7774728
                         if (cb == null) {
 155  
                                 // fall through
 156  7767516
                         } else if (dup.remaining() == 0) {
 157  7767516
                                 if (cb.remaining() == 1) {
 158  1506544
                                         char c = cb.get();
 159  1506544
                                         if (usePUA && needEscaping(c)) {
 160  394
                                                 if (out.remaining() < 2) {
 161  0
                                                         state = STATE_BUFFER_FULL;
 162  0
                                                         return false;
 163  
                                                 }
 164  394
                                                 out.put(PUA_ESCAPE_CHARACTER);
 165  394
                                                 out.put(c);
 166  394
                                                 buffer.clear();
 167  1506150
                                         } else if (isValidCodepoint(c)) {
 168  1505880
                                                 out.put(c);
 169  1505880
                                                 buffer.clear();
 170  
                                         } else {
 171  270
                                                 state = STATE_BUFFER_FULL_INVALID;
 172  
                                         }
 173  1506544
                                 } else if (cb.remaining() == 2) {
 174  6260972
                                         char c1 = cb.get();
 175  6260972
                                         char c2 = cb.get();
 176  6260972
                                         if (Character.isSurrogatePair(c1, c2) && isValidCodepoint(Character.toCodePoint(c1, c2))) {
 177  6260972
                                                 out.put(c1);
 178  6260972
                                                 out.put(c2);
 179  6260972
                                                 buffer.clear();
 180  
                                         } else {
 181  0
                                                 state = STATE_BUFFER_FULL_INVALID;
 182  
                                         }
 183  6260972
                                 } else {
 184  0
                                         throw new IllegalStateException("Buffer size is "+cb.remaining());
 185  
                                 }
 186  
                         } else {
 187  0
                                 throw new IllegalStateException("Remaining size is "+dup.remaining());
 188  
                         }
 189  7774728
                         if (state == STATE_BUFFER_FULL_INVALID) {
 190  7482
                                 if (!writeReplacements(out)) {
 191  0
                                         return false;
 192  
                                 }
 193  
                         }
 194  7774728
                         return true;
 195  
                 }
 196  
  
 197  
                 private boolean writeReplacements(CharBuffer out) {
 198  862911
                         if (out.remaining() < buffer.position()) {
 199  0
                                 return false;
 200  
                         }
 201  862911
                         buffer.flip();
 202  1775685
                         while(buffer.remaining() > 0) {
 203  912774
                                 out.put((char)((usePUA ? PUA_BASE_CODEPOINT : SURROGATE_BASE_CODEPOINT) + (                                buffer.get() & 0xFF))); 
 204  
 
 205  
                         }
 206  862911
                         buffer.clear();
 207  862911
                         state = STATE_BUFFER_EMPTY;
 208  862911
                         return true;
 209  
                 }
 210  
 
 211  
                 private void calculateState(byte b) {
 212  8624352
                         if (b >= 0) state = 1;
 213  7567125
                         else if ((b >> 5) == -2) state = 2;
 214  7291725
                          else if ((b >> 4) == -2) state = 3;
 215  6791812
                          else if ((b >> 3) == -2) state = 4;
 216  464402
                          else state = STATE_BUFFER_FULL_INVALID + 1;
 217  8624352
                 }
 218  
 
 219  
                 @Override
 220  
                 protected CoderResult implFlush(CharBuffer out) {
 221  5981
                         if (!writeReplacements(out))
 222  0
                                 return CoderResult.OVERFLOW;
 223  5981
                         return super.implFlush(out);
 224  
                 }
 225  
 
 226  
                 @Override
 227  
                 protected void implReset() {
 228  5981
                         buffer.clear();
 229  5981
                         state = STATE_BUFFER_EMPTY;
 230  5981
                 }
 231  
         }
 232  
         
 233  
         class Encoder extends CharsetEncoder {
 234  
 
 235  5983
                 private CharsetEncoder utf8Encoder = UTF_8.newEncoder();
 236  
                 
 237  5983
                 protected Encoder() {
 238  5983
                         super(UTF8BinaryCharset.this, 1.1f, 4.0f);
 239  5983
                 }
 240  
 
 241  
                 @Override
 242  
                 protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
 243  5981
                         CharBuffer buf = CharBuffer.allocate(2);
 244  5981
                         int baseCodepoint = (usePUA ? PUA_BASE_CODEPOINT : SURROGATE_BASE_CODEPOINT);
 245  8680018
                         while (in.remaining() > 0) {
 246  8674037
                                 char c = in.charAt(0);
 247  8674037
                                 boolean escaped = false;
 248  8674037
                                 if (usePUA && c == PUA_ESCAPE_CHARACTER) {
 249  394
                                         if (in.remaining() < 2)
 250  0
                                                 return CoderResult.UNDERFLOW;
 251  394
                                         c = in.charAt(1);
 252  394
                                         if (c == PUA_ESCAPE_CHARACTER) {
 253  3
                                                 in.get();
 254  3
                                                 escaped = true;
 255  
                                                 // fall through
 256  391
                                         } else if (c >= baseCodepoint + 0x80 && c <= baseCodepoint + 0xFF) {
 257  391
                                                 in.get();
 258  391
                                                 escaped = true;
 259  
                                                 // fall through
 260  
                                         } else {
 261  0
                                                 return CoderResult.unmappableForLength(1);
 262  
                                         }                                        
 263  
                                 }
 264  8674037
                                 if (!escaped && c >= baseCodepoint + 0x80 && c <= baseCodepoint + 0xFF) {
 265  912774
                                         if (out.remaining() == 0)
 266  0
                                                 return CoderResult.OVERFLOW;
 267  912774
                                         in.get();
 268  912774
                                         out.put((byte)(c-baseCodepoint));
 269  
                                 } else {
 270  7761263
                                         buf.clear();
 271  7761263
                                         buf.put(c);
 272  7761263
                                         boolean surrogatePair = false;
 273  7761263
                                         if (Character.isHighSurrogate(c)) {
 274  6260972
                                                 if (in.remaining() == 1)
 275  0
                                                         return CoderResult.UNDERFLOW;
 276  6260972
                                                 char c2 = in.charAt(1);
 277  6260972
                                                 if (Character.isLowSurrogate(c2)) {
 278  6260972
                                                         surrogatePair = true;
 279  6260972
                                                         buf.put(c2);
 280  
                                                 }
 281  
                                         }
 282  7761263
                                         buf.flip();
 283  7761263
                                         CoderResult cr = utf8Encoder.encode(buf, out, false);
 284  7761263
                                         if (cr != CoderResult.UNDERFLOW)
 285  0
                                                 return cr;
 286  7761263
                                         if (buf.remaining() > 0) {
 287  0
                                                 if (surrogatePair && buf.remaining() == 1)
 288  0
                                                         in.get();
 289  0
                                                 return CoderResult.unmappableForLength(buf.remaining());
 290  
                                         }
 291  7761263
                                         in.get();
 292  7761263
                                         if (surrogatePair)
 293  6260972
                                                 in.get();
 294  
                                 }                                
 295  8674037
                         }
 296  5981
                         return CoderResult.UNDERFLOW;
 297  
                 }
 298  
         }
 299  
 }