Utf8Decoder Class
Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The parameters can set an offset into a list of bytes (as int), limit the length of the values to be decoded, and override the default Unicode replacement character. Set replacementCodepoint to null to throw an IllegalArgumentException rather than replace the bad value. The return value from this method can be used as an Iterable (e.g. in a for-loop).
Implements
Constructors
Code new Utf8Decoder._fromListRangeIterator(_ListRange source, [int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) #
/**
 * Creates a decoder that reads its bytes from the iterator of [source].
 *
 * [replacementCodepoint] is stored on the decoder and defaults to
 * UNICODE_REPLACEMENT_CHARACTER_CODEPOINT.
 */
Utf8Decoder._fromListRangeIterator(
    _ListRange source,
    [int this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
    : utf8EncodedBytesIterator = source.iterator();
Code new Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) #
/**
 * Creates a decoder over [utf8EncodedBytes].
 *
 * [offset] and [length] are forwarded to a _ListRange built over the input
 * (presumably selecting the sub-range of bytes to decode). The decoder then
 * iterates over that range. [replacementCodepoint] is stored on the decoder
 * and defaults to UNICODE_REPLACEMENT_CHARACTER_CODEPOINT.
 */
Utf8Decoder(
    List<int> utf8EncodedBytes,
    [int offset = 0,
     int length,
     int this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
    : utf8EncodedBytesIterator =
          new _ListRange(utf8EncodedBytes, offset, length).iterator();
Methods
Code List<int> decodeRest() #
Decode the remainder of the characters in this decoder
into a [List<int>].
/**
 * Decodes every codepoint still available from this decoder and returns
 * them as a list whose length equals the number of codepoints decoded.
 */
List<int> decodeRest() {
  // Allocate one slot per remaining input byte; the result is trimmed below
  // when fewer codepoints than that are produced.
  final List<int> buffer = new List<int>(utf8EncodedBytesIterator.remaining);

  int decoded = 0;
  for (; hasNext(); decoded++) {
    buffer[decoded] = next();
  }

  if (decoded != buffer.length) {
    // Fewer codepoints than slots: copy the filled prefix into a list of
    // exactly the right size.
    final List<int> exact = new List<int>(decoded);
    exact.setRange(0, decoded, buffer);
    return exact;
  }
  return buffer;
}
Code int next() #
/**
 * Decodes and returns the next codepoint from the underlying byte iterator.
 *
 * The first value read selects how many additional bytes are expected; each
 * additional byte contributes its low six bits to the result. The decoded
 * value is rejected when the sequence is cut short, when it is an overlong
 * encoding, when it lies between UNICODE_UTF16_RESERVED_LO and
 * UNICODE_UTF16_RESERVED_HI, or when it exceeds UNICODE_VALID_RANGE_MAX.
 * Negative values, values that are only valid as continuation bytes, and
 * values at or above _UTF8_FIRST_BYTE_BOUND_EXCL are rejected as first bytes.
 *
 * Rejected input yields [replacementCodepoint]; if [replacementCodepoint] is
 * null an IllegalArgumentException is thrown instead.
 */
int next() {
  int value = utf8EncodedBytesIterator.next();
  int additionalBytes = 0;

  if (value < 0) {
    if (replacementCodepoint != null) {
      return replacementCodepoint;
    } else {
      throw new IllegalArgumentException(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value <= _UTF8_ONE_BYTE_MAX) {
    // Single-byte value: it is the codepoint.
    return value;
  } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
    // A continuation byte cannot start a sequence.
    if (replacementCodepoint != null) {
      return replacementCodepoint;
    } else {
      throw new IllegalArgumentException(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
    additionalBytes = 1;
  } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
    additionalBytes = 2;
  } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
    additionalBytes = 3;
  } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
    additionalBytes = 4;
  } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
    value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
    additionalBytes = 5;
  } else if (replacementCodepoint != null) {
    return replacementCodepoint;
  } else {
    throw new IllegalArgumentException(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
  }

  // Fold in the expected continuation bytes; j counts how many were accepted.
  int j = 0;
  while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) {
    int nextValue = utf8EncodedBytesIterator.next();
    if (nextValue > _UTF8_ONE_BYTE_MAX &&
        nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
    } else {
      // Not a continuation byte, so it does not belong to this sequence.
      // Un-read it so the following call to next() decodes it on its own.
      // Previously only multi-byte lead bytes were un-read, which silently
      // dropped a single-byte value that interrupted a truncated sequence.
      utf8EncodedBytesIterator.backup();
      break;
    }
    j++;
  }

  bool validSequence = (j == additionalBytes &&
      (value < UNICODE_UTF16_RESERVED_LO ||
       value > UNICODE_UTF16_RESERVED_HI));
  // Only 2-, 3- and 4-byte forms can pass; longer forms are always rejected.
  bool nonOverlong =
      (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
      (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
      (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
  bool inRange = value <= UNICODE_VALID_RANGE_MAX;

  if (validSequence && nonOverlong && inRange) {
    return value;
  } else if (replacementCodepoint != null) {
    return replacementCodepoint;
  } else {
    throw new IllegalArgumentException(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
  }
}
Fields
Code final _ListRangeIterator utf8EncodedBytesIterator #
/** Cursor over the UTF-8 encoded input; [next] reads its bytes from here. */
final _ListRangeIterator utf8EncodedBytesIterator;