Dart API Reference > dart:utf > Utf8Decoder

Utf8Decoder class

Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The parameters can set an offset into a list of bytes (as int), limit the length of the values to be decoded, and override the default Unicode replacement character. Set the replacementCodepoint to null to throw an ArgumentError rather than replace the bad value. The return value from this method can be used as an Iterable (e.g. in a for-loop).

/**
 * Iterator that decodes UTF-8 encoded byte values into Unicode codepoints,
 * producing one codepoint per call to [next].
 *
 * Malformed input is handled per sequence: when [replacementCodepoint] is
 * non-null it is returned in place of the bad sequence, otherwise an
 * [ArgumentError] is thrown.
 */
class Utf8Decoder implements Iterator<int> {
  /**
   * Source of the raw byte values. [next] advances it and, when a sequence
   * is cut short, backs it up by one value.
   */
  final _ListRangeIterator utf8EncodedBytesIterator;

  /**
   * Codepoint returned by [next] in place of invalid input, or null to make
   * [next] throw an [ArgumentError] instead.
   */
  final int replacementCodepoint;

  /**
   * Creates a decoder over [utf8EncodedBytes].
   *
   * [offset] and [length] are forwarded unchanged to the [_ListRange] that
   * supplies the byte iterator; [replacementCodepoint] defaults to
   * [UNICODE_REPLACEMENT_CHARACTER_CODEPOINT].
   */
  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset,
          length)).iterator();


  /**
   * Creates a decoder that reads from the iterator of an existing
   * [_ListRange]. Note that, despite the constructor's name, [source] is
   * the range itself and not an iterator.
   */
  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator();

  /**
   * Decodes the remainder of the characters in this decoder
   * into a [List<int>].
   *
   * The result is first sized from the iterator's `remaining` count and is
   * copied into a shorter list when fewer codepoints than that were
   * produced (as happens whenever a multi-byte sequence is decoded).
   */
  List<int> decodeRest() {
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int i = 0;
    while (hasNext()) {
      codepoints[i++] = next();
    }
    if (i == codepoints.length) {
      return codepoints;
    } else {
      // Fewer codepoints than slots: return only the filled prefix.
      List<int> truncCodepoints = new List<int>(i);
      truncCodepoints.setRange(0, i, codepoints);
      return truncCodepoints;
    }
  }

  /** Whether the underlying byte iterator has values left to decode. */
  bool hasNext() => utf8EncodedBytesIterator.hasNext();

  /**
   * Decodes and returns the next codepoint.
   *
   * Reads one lead value, determines from it how many continuation values
   * must follow, and folds six bits from each continuation value into the
   * result. The sequence is rejected when a continuation value is missing,
   * when the result lies in the UTF-16 reserved range, when it is an
   * overlong encoding, or when it exceeds [UNICODE_VALID_RANGE_MAX].
   *
   * On rejection returns [replacementCodepoint] if it is non-null and
   * throws an [ArgumentError] otherwise.
   */
  int next() {
    int value = utf8EncodedBytesIterator.next();
    int additionalBytes = 0;

    // Classify the lead value. Single-value outcomes return immediately;
    // multi-byte leads strip their prefix and record how many continuation
    // values are expected.
    if (value < 0) {
      if (replacementCodepoint != null) {
        return replacementCodepoint;
      } else {
        throw new ArgumentError(
            "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
      }
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      return value;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation value where a lead value was expected.
      if (replacementCodepoint != null) {
        return replacementCodepoint;
      } else {
        throw new ArgumentError(
            "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
      }
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else if (replacementCodepoint != null) {
      return replacementCodepoint;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
    // j counts the continuation values successfully consumed.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) {
      int nextValue = utf8EncodedBytesIterator.next();
      if (nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
        value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
      } else {
        // Not a continuation value, so this sequence ends early. Un-read the
        // value so the following call decodes it on its own: consuming it
        // here would silently drop a well-formed character, e.g. a one-byte
        // character that follows a truncated lead. Negative values can never
        // start a sequence and stay consumed as part of this bad sequence.
        if (nextValue >= 0) {
          utf8EncodedBytesIterator.backup();
        }
        break;
      }
      j++;
    }
    bool validSequence = (j == additionalBytes && (
        value < UNICODE_UTF16_RESERVED_LO ||
        value > UNICODE_UTF16_RESERVED_HI));
    // Only 2-, 3- and 4-byte forms can pass; sequences with 4 or 5
    // continuation values never satisfy this and are always rejected.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
    if (validSequence && nonOverlong && inRange) {
      return value;
    } else if (replacementCodepoint != null) {
      return replacementCodepoint;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
    }
  }
}

Implements

Iterator<E>

Constructors

new Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) #

/**
 * Creates a decoder over [utf8EncodedBytes].
 *
 * [offset] and [length] are handed unchanged to the [_ListRange] whose
 * iterator supplies the byte values; [replacementCodepoint] defaults to
 * [UNICODE_REPLACEMENT_CHARACTER_CODEPOINT].
 */
Utf8Decoder(List<int> utf8EncodedBytes,
    [int offset = 0,
    int length,
    this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
    : utf8EncodedBytesIterator =
          new _ListRange(utf8EncodedBytes, offset, length).iterator();

Properties

final int replacementCodepoint #

/**
 * Codepoint returned by next() in place of invalid input; when null,
 * next() throws an ArgumentError instead.
 */
final int replacementCodepoint;

final _ListRangeIterator utf8EncodedBytesIterator #

/**
 * Iterator over the byte values being decoded; obtained in the constructor
 * from a _ListRange built over the input list.
 */
final _ListRangeIterator utf8EncodedBytesIterator;

Methods

List<int> decodeRest() #

Decode the remainder of the characters in this decoder into a [List<int>].

/**
 * Decodes every remaining codepoint and returns them as a list.
 *
 * A buffer sized from the iterator's `remaining` count is filled first; if
 * decoding produced fewer codepoints than that, only the filled prefix is
 * returned, in a new list of exactly that length.
 */
List<int> decodeRest() {
  List<int> buffer = new List<int>(utf8EncodedBytesIterator.remaining);
  int decoded = 0;
  for (; hasNext(); decoded++) {
    buffer[decoded] = next();
  }
  // Every slot used: the buffer itself is the result.
  if (decoded == buffer.length) {
    return buffer;
  }
  List<int> trimmed = new List<int>(decoded);
  trimmed.setRange(0, decoded, buffer);
  return trimmed;
}

bool hasNext() #

Returns whether the Iterator has elements left.

docs inherited from Iterator<E>
/** Whether the underlying byte iterator still has values to decode. */
bool hasNext() {
  return utf8EncodedBytesIterator.hasNext();
}

int next() #

Gets the next element in the iteration. Throws a NoMoreElementsException if no element is left.

docs inherited from Iterator<E>
/**
 * Decodes and returns the next codepoint.
 *
 * Reads one lead value, determines from it how many continuation values
 * must follow, and folds six bits from each continuation value into the
 * result. The sequence is rejected when a continuation value is missing,
 * when the result lies in the UTF-16 reserved range, when it is an overlong
 * encoding, or when it exceeds UNICODE_VALID_RANGE_MAX.
 *
 * On rejection returns replacementCodepoint if it is non-null and throws an
 * ArgumentError otherwise.
 */
int next() {
  int value = utf8EncodedBytesIterator.next();
  int additionalBytes = 0;

  // Classify the lead value. Single-value outcomes return immediately;
  // multi-byte leads strip their prefix and record how many continuation
  // values are expected.
  if (value < 0) {
    if (replacementCodepoint != null) {
      return replacementCodepoint;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value <= _UTF8_ONE_BYTE_MAX) {
    return value;
  } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
    // A continuation value where a lead value was expected.
    if (replacementCodepoint != null) {
      return replacementCodepoint;
    } else {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
    }
  } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
    additionalBytes = 1;
  } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
    additionalBytes = 2;
  } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
    additionalBytes = 3;
  } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
    value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
    additionalBytes = 4;
  } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
    value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
    additionalBytes = 5;
  } else if (replacementCodepoint != null) {
    return replacementCodepoint;
  } else {
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
  }
  // j counts the continuation values successfully consumed.
  int j = 0;
  while (j < additionalBytes && utf8EncodedBytesIterator.hasNext()) {
    int nextValue = utf8EncodedBytesIterator.next();
    if (nextValue > _UTF8_ONE_BYTE_MAX &&
        nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
    } else {
      // Not a continuation value, so this sequence ends early. Un-read the
      // value so the following call decodes it on its own: consuming it
      // here would silently drop a well-formed character, e.g. a one-byte
      // character that follows a truncated lead. Negative values can never
      // start a sequence and stay consumed as part of this bad sequence.
      if (nextValue >= 0) {
        utf8EncodedBytesIterator.backup();
      }
      break;
    }
    j++;
  }
  bool validSequence = (j == additionalBytes && (
      value < UNICODE_UTF16_RESERVED_LO ||
      value > UNICODE_UTF16_RESERVED_HI));
  // Only 2-, 3- and 4-byte forms can pass; sequences with 4 or 5
  // continuation values never satisfy this and are always rejected.
  bool nonOverlong =
      (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
      (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
      (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
  bool inRange = value <= UNICODE_VALID_RANGE_MAX;
  if (validSequence && nonOverlong && inRange) {
    return value;
  } else if (replacementCodepoint != null) {
    return replacementCodepoint;
  } else {
    throw new ArgumentError(
        "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
  }
}