# get the next utf-8 character from the input stream # and encode as UTF8 if possible/necessary sub nextchar { my $self = shift; $self->next; return unless defined $self->[CURRENT]; if ($self->[CURRENT] eq "\x0D") { $self->next; return unless defined($self->[CURRENT]); if ($self->[CURRENT] ne "\x0A") { $self->buffer("\x0A"); } } return unless $self->[ENCODING]; my $n = ord($self->[CURRENT]); # warn(sprintf("ch: 0x%x ($self->[CURRENT])\n", $n)); if (($] < 5.007002) && ($n > 0x7F)) { # utf8 surrogate my $current = $self->[CURRENT]; if ($n >= 0xFC) { # read 5 chars $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; } elsif ($n >= 0xF8) { # read 4 chars $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; } elsif ($n >= 0xF0) { # read 3 chars $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; } elsif ($n >= 0xE0) { # read 2 chars $self->next; $current .= $self->[CURRENT]; $self->next; $current .= $self->[CURRENT]; } elsif ($n >= 0xC0) { # read 1 char $self->next; $current .= $self->[CURRENT]; } else { throw XML::SAX::Exception::Parse( Message => sprintf("Invalid character 0x%x", $n), ColumnNumber => $self->column, LineNumber => $self->line, PublicId => $self->public_id, SystemId => $self->system_id, ); } if ($] >= 5.006001) { $self->[CURRENT] = pack("U0A*", $current); } else { $self->[CURRENT] = $current; } } } ###################################################### # match anything *not* in the list of characters given sub match_not { my $self = shift; my $current = $self->[CURRENT]; return 0 unless defined $current; for my $m (@_) { if ($current eq $m) { $self->[MATCHED] = ''; return 0; } } $self->[MATCHED] = $current; $self->nextchar; return 1; } ###################################################### # get the next byte or character (character on 5.7.2+ # or byte on 5.5 or 5.6) from a fh, or from the buffer # if one exists. sub next { my $self = shift; # check for chars in buffer first. if (length($self->[BUFFER])) { return $self->[CURRENT] = substr($self->[BUFFER], 0, 1, ''); # last param truncates buffer } if (length($self->[INTERNAL_BUFFER])) { BUFFERED_READ: $self->[CURRENT] = substr($self->[INTERNAL_BUFFER], 0, 1, ''); if ($self->[CURRENT] eq "\x0A") { $self->[LINE]++; $self->[COLUMN] = 1; } else { $self->[COLUMN]++ } return; } my $bytesread = read($self->[FH], $self->[INTERNAL_BUFFER], $self->[BUFFER_SIZE]); if ($bytesread) { # yes, goto. If you have a faster way feel free! goto BUFFERED_READ; } elsif (defined($bytesread)) { $self->[EOF]++; return $self->[CURRENT] = undef; } throw XML::SAX::Exception::Parse( Message => "Error reading from filehandle: $!", ); } ########################################################## # consume everything not matching chars listed in @_ sub consume_not { my $self = shift; my $consumed = ''; while(!$self->[EOF] && $self->match_not(@_)) { $consumed .= $self->[MATCHED]; } return length($self->[CONSUMED] = $consumed); } ######################################################## # Does the current token match a single character? sub match_char { my $self = shift; if (defined($self->[CURRENT]) && $self->[CURRENT] eq $_[0]) { $self->[MATCHED] = $_[0]; $self->nextchar; return 1; } $self->[MATCHED] = ''; return 0; } ######################################################### # Does the current token match a single regexp? sub match_re { my $self = shift; if ($self->[CURRENT] =~ $_[0]) { $self->[MATCHED] = $self->[CURRENT]; $self->nextchar; return 1; } $self->[MATCHED] = ''; return 0; }