Gecko [ www.vortexaion.com ]

Name	Size	Permission	Date
VerticalAligner	[ DIR ]	drwxrwxrwx	2024-07-26 17:38
Debugger.pm	3.5 KB	-rw-rw-rw-	2021-01-09 17:29
DevNull.pm	391 B	-rw-rw-rw-	2021-01-09 17:29
Diagnostics.pm	2.73 KB	-rw-rw-rw-	2021-01-09 17:29
FileWriter.pm	11.81 KB	-rw-rw-rw-	2021-01-09 17:29
Formatter.pm	780.51 KB	-rw-rw-rw-	2021-01-10 16:05
HtmlWriter.pm	49.26 KB	-rw-rw-rw-	2021-01-09 17:29
IOScalar.pm	3.42 KB	-rw-rw-rw-	2021-01-09 17:29
IOScalarArray.pm	2.98 KB	-rw-rw-rw-	2021-01-09 17:29
IndentationItem.pm	7.51 KB	-rw-rw-rw-	2021-01-09 17:29
LineBuffer.pm	2.3 KB	-rw-rw-rw-	2021-01-09 17:29
LineSink.pm	3.39 KB	-rw-rw-rw-	2021-01-09 17:29
LineSource.pm	3.63 KB	-rw-rw-rw-	2021-01-09 17:29
Logger.pm	18.63 KB	-rw-rw-rw-	2021-01-09 17:29
Tokenizer.pm	319.26 KB	-rw-rw-rw-	2021-01-09 17:31
VerticalAligner.pm	197.81 KB	-rw-rw-rw-	2021-01-09 17:30

Rename

#####################################################################
#
# The Perl::Tidy::Tokenizer package is essentially a filter which
# reads lines of perl source code from a source object and provides
# corresponding tokenized lines through its get_line() method.  Lines
# flow from the source_object to the caller like this:
#
# source_object --> LineBuffer_object --> Tokenizer -->  calling routine
#   get_line()         get_line()           get_line()     line_of_tokens
#
# The source object can be any object with a get_line() method which
# supplies one line (a character string) perl call.
# The LineBuffer object is created by the Tokenizer.
# The Tokenizer returns a reference to a data structure 'line_of_tokens'
# containing one tokenized line for each call to its get_line() method.
#
# WARNING: This is not a real class.  Only one tokenizer my be used.
#
########################################################################

package Perl::Tidy::Tokenizer;
use strict;
use warnings;
our $VERSION = '20210111';

use Perl::Tidy::LineBuffer;
use Carp;

# PACKAGE VARIABLES for processing an entire FILE.
# These must be package variables because most may get localized during
# processing.  Most are initialized in sub prepare_for_a_new_file.
use vars qw{
  $tokenizer_self

$last_nonblank_token
  $last_nonblank_type
  $last_nonblank_block_type
  $statement_type
  $in_attribute_list
  $current_package
  $context

%is_constant
  %is_user_function
  %user_function_prototype
  %is_block_function
  %is_block_list_function
  %saw_function_definition
  %saw_use_module

$brace_depth
  $paren_depth
  $square_bracket_depth

@current_depth
  @total_depth
  $total_depth
  @nesting_sequence_number
  @current_sequence_number
  @paren_type
  @paren_semicolon_count
  @paren_structural_type
  @brace_type
  @brace_structural_type
  @brace_context
  @brace_package
  @square_bracket_type
  @square_bracket_structural_type
  @depth_array
  @nested_ternary_flag
  @nested_statement_type
  @starting_line_of_current_depth
};

# GLOBAL CONSTANTS for routines in this package,
# Initialized in a BEGIN block.
use vars qw{
  %is_indirect_object_taker
  %is_block_operator
  %expecting_operator_token
  %expecting_operator_types
  %expecting_term_types
  %expecting_term_token
  %is_digraph
  %is_file_test_operator
  %is_trigraph
  %is_tetragraph
  %is_valid_token_type
  %is_keyword
  %is_code_block_token
  %really_want_term
  @opening_brace_names
  @closing_brace_names
  %is_keyword_taking_list
  %is_keyword_taking_optional_arg
  %is_keyword_rejecting_slash_as_pattern_delimiter
  %is_keyword_rejecting_question_as_pattern_delimiter
  %is_q_qq_qw_qx_qr_s_y_tr_m
  %is_sub
  %is_package
  %is_comma_question_colon
  %other_line_endings
};

# possible values of operator_expected()
use constant TERM     => -1;
use constant UNKNOWN  => 0;
use constant OPERATOR => 1;

# possible values of context
use constant SCALAR_CONTEXT  => -1;
use constant UNKNOWN_CONTEXT => 0;
use constant LIST_CONTEXT    => 1;

# Maximum number of little messages; probably need not be changed.
use constant MAX_NAG_MESSAGES => 6;

BEGIN {

# Array index names for $self
    my $i = 0;
    use constant {
        _rhere_target_list_                  => $i++,
        _in_here_doc_                        => $i++,
        _here_doc_target_                    => $i++,
        _here_quote_character_               => $i++,
        _in_data_                            => $i++,
        _in_end_                             => $i++,
        _in_format_                          => $i++,
        _in_error_                           => $i++,
        _in_pod_                             => $i++,
        _in_attribute_list_                  => $i++,
        _in_quote_                           => $i++,
        _quote_target_                       => $i++,
        _line_start_quote_                   => $i++,
        _starting_level_                     => $i++,
        _know_starting_level_                => $i++,
        _tabsize_                            => $i++,
        _indent_columns_                     => $i++,
        _look_for_hash_bang_                 => $i++,
        _trim_qw_                            => $i++,
        _continuation_indentation_           => $i++,
        _outdent_labels_                     => $i++,
        _last_line_number_                   => $i++,
        _saw_perl_dash_P_                    => $i++,
        _saw_perl_dash_w_                    => $i++,
        _saw_use_strict_                     => $i++,
        _saw_v_string_                       => $i++,
        _hit_bug_                            => $i++,
        _look_for_autoloader_                => $i++,
        _look_for_selfloader_                => $i++,
        _saw_autoloader_                     => $i++,
        _saw_selfloader_                     => $i++,
        _saw_hash_bang_                      => $i++,
        _saw_end_                            => $i++,
        _saw_data_                           => $i++,
        _saw_negative_indentation_           => $i++,
        _started_tokenizing_                 => $i++,
        _line_buffer_object_                 => $i++,
        _debugger_object_                    => $i++,
        _diagnostics_object_                 => $i++,
        _logger_object_                      => $i++,
        _unexpected_error_count_             => $i++,
        _started_looking_for_here_target_at_ => $i++,
        _nearly_matched_here_target_at_      => $i++,
        _line_of_text_                       => $i++,
        _rlower_case_labels_at_              => $i++,
        _extended_syntax_                    => $i++,
        _maximum_level_                      => $i++,
        _true_brace_error_count_             => $i++,
        _rOpts_maximum_level_errors_         => $i++,
        _rOpts_maximum_unexpected_errors_    => $i++,
        _rOpts_logfile_                      => $i++,
        _rOpts_                              => $i++,
    };
}

{    ## closure for subs to count instances

# methods to count instances
    my $_count = 0;
    sub get_count        { return $_count; }
    sub _increment_count { return ++$_count }
    sub _decrement_count { return --$_count }
}

sub DESTROY {
    my $self = shift;
    $self->_decrement_count();
    return;
}

sub AUTOLOAD {

# Catch any undefined sub calls so that we are sure to get
    # some diagnostic information.  This sub should never be called
    # except for a programming error.
    our $AUTOLOAD;
    return if ( $AUTOLOAD =~ /\bDESTROY$/ );
    my ( $pkg, $fname, $lno ) = caller();
    my $my_package = __PACKAGE__;
    print STDERR <<EOM;
======================================================================
Error detected in package '$my_package', version $VERSION
Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
Called from package: '$pkg'  
Called from File '$fname'  at line '$lno'
This error is probably due to a recent programming change
======================================================================
EOM
    exit 1;
}

sub check_options {

# Check Tokenizer parameters
    my $rOpts = shift;

%is_sub = ();
    $is_sub{'sub'} = 1;

# Install any aliases to 'sub'
    if ( $rOpts->{'sub-alias-list'} ) {

# Note that any 'sub-alias-list' has been preprocessed to
        # be a trimmed, space-separated list which includes 'sub'
        # for example, it might be 'sub method fun'
        my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
        foreach my $word (@sub_alias_list) {
            $is_sub{$word} = 1;
        }
    }
    return;
}

sub new {

my ( $class, @args ) = @_;

# Note: 'tabs' and 'indent_columns' are temporary and should be
    # removed asap
    my %defaults = (
        source_object        => undef,
        debugger_object      => undef,
        diagnostics_object   => undef,
        logger_object        => undef,
        starting_level       => undef,
        indent_columns       => 4,
        tabsize              => 8,
        look_for_hash_bang   => 0,
        trim_qw              => 1,
        look_for_autoloader  => 1,
        look_for_selfloader  => 1,
        starting_line_number => 1,
        extended_syntax      => 0,
        rOpts                => {},
    );
    my %args = ( %defaults, @args );

# we are given an object with a get_line() method to supply source lines
    my $source_object = $args{source_object};
    my $rOpts         = $args{rOpts};

# we create another object with a get_line() and peek_ahead() method
    my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);

# Tokenizer state data is as follows:
    # _rhere_target_list_    reference to list of here-doc targets
    # _here_doc_target_      the target string for a here document
    # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
    #                        to determine if interpolation is done
    # _quote_target_         character we seek if chasing a quote
    # _line_start_quote_     line where we started looking for a long quote
    # _in_here_doc_          flag indicating if we are in a here-doc
    # _in_pod_               flag set if we are in pod documentation
    # _in_error_             flag set if we saw severe error (binary in script)
    # _in_data_              flag set if we are in __DATA__ section
    # _in_end_               flag set if we are in __END__ section
    # _in_format_            flag set if we are in a format description
    # _in_attribute_list_    flag telling if we are looking for attributes
    # _in_quote_             flag telling if we are chasing a quote
    # _starting_level_       indentation level of first line
    # _line_buffer_object_   object with get_line() method to supply source code
    # _diagnostics_object_   place to write debugging information
    # _unexpected_error_count_ error count used to limit output
    # _lower_case_labels_at_ line numbers where lower case labels seen
    # _hit_bug_              program bug detected

my $self = [];
    $self->[_rhere_target_list_]        = [];
    $self->[_in_here_doc_]              = 0;
    $self->[_here_doc_target_]          = "";
    $self->[_here_quote_character_]     = "";
    $self->[_in_data_]                  = 0;
    $self->[_in_end_]                   = 0;
    $self->[_in_format_]                = 0;
    $self->[_in_error_]                 = 0;
    $self->[_in_pod_]                   = 0;
    $self->[_in_attribute_list_]        = 0;
    $self->[_in_quote_]                 = 0;
    $self->[_quote_target_]             = "";
    $self->[_line_start_quote_]         = -1;
    $self->[_starting_level_]           = $args{starting_level};
    $self->[_know_starting_level_]      = defined( $args{starting_level} );
    $self->[_tabsize_]                  = $args{tabsize};
    $self->[_indent_columns_]           = $args{indent_columns};
    $self->[_look_for_hash_bang_]       = $args{look_for_hash_bang};
    $self->[_trim_qw_]                  = $args{trim_qw};
    $self->[_continuation_indentation_] = $args{continuation_indentation};
    $self->[_outdent_labels_]           = $args{outdent_labels};
    $self->[_last_line_number_]         = $args{starting_line_number} - 1;
    $self->[_saw_perl_dash_P_]          = 0;
    $self->[_saw_perl_dash_w_]          = 0;
    $self->[_saw_use_strict_]           = 0;
    $self->[_saw_v_string_]             = 0;
    $self->[_hit_bug_]                  = 0;
    $self->[_look_for_autoloader_]      = $args{look_for_autoloader};
    $self->[_look_for_selfloader_]      = $args{look_for_selfloader};
    $self->[_saw_autoloader_]           = 0;
    $self->[_saw_selfloader_]           = 0;
    $self->[_saw_hash_bang_]            = 0;
    $self->[_saw_end_]                  = 0;
    $self->[_saw_data_]                 = 0;
    $self->[_saw_negative_indentation_] = 0;
    $self->[_started_tokenizing_]       = 0;
    $self->[_line_buffer_object_]       = $line_buffer_object;
    $self->[_debugger_object_]          = $args{debugger_object};
    $self->[_diagnostics_object_]       = $args{diagnostics_object};
    $self->[_logger_object_]            = $args{logger_object};
    $self->[_unexpected_error_count_]   = 0;
    $self->[_started_looking_for_here_target_at_] = 0;
    $self->[_nearly_matched_here_target_at_]      = undef;
    $self->[_line_of_text_]                       = "";
    $self->[_rlower_case_labels_at_]              = undef;
    $self->[_extended_syntax_]                    = $args{extended_syntax};
    $self->[_maximum_level_]                      = 0;
    $self->[_true_brace_error_count_]             = 0;
    $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
    $self->[_rOpts_maximum_unexpected_errors_] =
      $rOpts->{'maximum-unexpected-errors'};
    $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
    $self->[_rOpts_]         = $rOpts;
    bless $self, $class;

$tokenizer_self = $self;

prepare_for_a_new_file();
    find_starting_indentation_level();

# This is not a full class yet, so die if an attempt is made to
    # create more than one object.

if ( _increment_count() > 1 ) {
        confess
"Attempt to create more than 1 object in $class, which is not a true class yet\n";
    }

return $self;

}

# interface to Perl::Tidy::Logger routines
sub warning {
    my $msg           = shift;
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->warning($msg);
    }
    return;
}

sub complain {
    my $msg           = shift;
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->complain($msg);
    }
    return;
}

sub write_logfile_entry {
    my $msg           = shift;
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->write_logfile_entry($msg);
    }
    return;
}

sub interrupt_logfile {
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->interrupt_logfile();
    }
    return;
}

sub resume_logfile {
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->resume_logfile();
    }
    return;
}

sub increment_brace_error {
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->increment_brace_error();
    }
    return;
}

sub report_definite_bug {
    $tokenizer_self->[_hit_bug_] = 1;
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->report_definite_bug();
    }
    return;
}

sub brace_warning {
    my $msg           = shift;
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        $logger_object->brace_warning($msg);
    }
    return;
}

sub get_saw_brace_error {
    my $logger_object = $tokenizer_self->[_logger_object_];
    if ($logger_object) {
        return $logger_object->get_saw_brace_error();
    }
    else {
        return 0;
    }
}

sub get_unexpected_error_count {
    my ($self) = @_;
    return $self->[_unexpected_error_count_];
}

# interface to Perl::Tidy::Diagnostics routines
sub write_diagnostics {
    my $msg = shift;
    if ( $tokenizer_self->[_diagnostics_object_] ) {
        $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
    }
    return;
}

sub get_maximum_level {
    return $tokenizer_self->[_maximum_level_];
}

sub report_tokenization_errors {

my ($self) = @_;

# Report any tokenization errors and return a flag '$severe_error'.
    # Set $severe_error = 1 if the tokenizations errors are so severe that
    # the formatter should not attempt to format the file. Instead, it will
    # just output the file verbatim.

# set severe error flag if tokenizer has encountered file reading problems
    # (i.e. unexpected binary characters)
    my $severe_error = $self->[_in_error_];

my $maxle = $self->[_rOpts_maximum_level_errors_];
    my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
    $maxle = 1 unless defined($maxle);
    $maxue = 0 unless defined($maxue);

my $level = get_indentation_level();
    if ( $level != $tokenizer_self->[_starting_level_] ) {
        warning("final indentation level: $level\n");
        my $level_diff = $tokenizer_self->[_starting_level_] - $level;
        if ( $level_diff < 0 ) { $level_diff = -$level_diff }

# Set severe error flag if the level error is greater than 1.
        # The formatter can function for any level error but it is probably
        # best not to attempt formatting for a high level error.
        if ( $maxle >= 0 && $level_diff > $maxle ) {
            $severe_error = 1;
            warning(<<EOM);
Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
EOM
        }
    }

check_final_nesting_depths();

# Likewise, large numbers of brace errors usually indicate non-perl
    # scirpts, so set the severe error flag at a low number.  This is similar
    # to the level check, but different because braces may balance but be
    # incorrectly interlaced.
    if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
        $severe_error = 1;
    }

if ( $tokenizer_self->[_look_for_hash_bang_]
        && !$tokenizer_self->[_saw_hash_bang_] )
    {
        warning(
            "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
    }

if ( $tokenizer_self->[_in_format_] ) {
        warning("hit EOF while in format description\n");
    }

if ( $tokenizer_self->[_in_pod_] ) {

# Just write log entry if this is after __END__ or __DATA__
        # because this happens to often, and it is not likely to be
        # a parsing error.
        if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
            write_logfile_entry(
"hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
            );
        }

else {
            complain(
"hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
            );
        }

}

if ( $tokenizer_self->[_in_here_doc_] ) {
        $severe_error = 1;
        my $here_doc_target = $tokenizer_self->[_here_doc_target_];
        my $started_looking_for_here_target_at =
          $tokenizer_self->[_started_looking_for_here_target_at_];
        if ($here_doc_target) {
            warning(
"hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
            );
        }
        else {
            warning(
"hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
            );
        }
        my $nearly_matched_here_target_at =
          $tokenizer_self->[_nearly_matched_here_target_at_];
        if ($nearly_matched_here_target_at) {
            warning(
"NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
            );
        }
    }

# Something is seriously wrong if we ended inside a quote
    if ( $tokenizer_self->[_in_quote_] ) {
        $severe_error = 1;
        my $line_start_quote = $tokenizer_self->[_line_start_quote_];
        my $quote_target     = $tokenizer_self->[_quote_target_];
        my $what =
          ( $tokenizer_self->[_in_attribute_list_] )
          ? "attribute list"
          : "quote/pattern";
        warning(
"hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
        );
    }

if ( $tokenizer_self->[_hit_bug_] ) {
        $severe_error = 1;
    }

# Multiple "unexpected" type tokenization errors usually indicate parsing
    # non-perl scripts, or that something is seriously wrong, so we should
    # avoid formatting them.  This can happen for example if we run perltidy on
    # a shell script or an html file.  But unfortunately this check can
    # interfere with some extended syntaxes, such as RPerl, so it has to be off
    # by default.
    my $ue_count = $tokenizer_self->[_unexpected_error_count_];
    if ( $maxue > 0 && $ue_count > $maxue ) {
        warning(<<EOM);
Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
EOM
        $severe_error = 1;
    }

unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
        if ( $] < 5.006 ) {
            write_logfile_entry("Suggest including '-w parameter'\n");
        }
        else {
            write_logfile_entry("Suggest including 'use warnings;'\n");
        }
    }

if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
        write_logfile_entry("Use of -P parameter for defines is discouraged\n");
    }

unless ( $tokenizer_self->[_saw_use_strict_] ) {
        write_logfile_entry("Suggest including 'use strict;'\n");
    }

# it is suggested that labels have at least one upper case character
    # for legibility and to avoid code breakage as new keywords are introduced
    if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
        my @lower_case_labels_at =
          @{ $tokenizer_self->[_rlower_case_labels_at_] };
        write_logfile_entry(
            "Suggest using upper case characters in label(s)\n");
        local $" = ')(';
        write_logfile_entry("  defined at line(s): (@lower_case_labels_at)\n");
    }
    return $severe_error;
}

sub report_v_string {

# warn if this version can't handle v-strings
    my $tok = shift;
    unless ( $tokenizer_self->[_saw_v_string_] ) {
        $tokenizer_self->[_saw_v_string_] =
          $tokenizer_self->[_last_line_number_];
    }
    if ( $] < 5.006 ) {
        warning(
"Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
        );
    }
    return;
}

sub get_input_line_number {
    return $tokenizer_self->[_last_line_number_];
}

# returns the next tokenized line
sub get_line {

my $self = shift;

# USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
    # $square_bracket_depth, $paren_depth

my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
    $tokenizer_self->[_line_of_text_] = $input_line;

return unless ($input_line);

my $input_line_number = ++$tokenizer_self->[_last_line_number_];

# Find and remove what characters terminate this line, including any
    # control r
    my $input_line_separator = "";
    if ( chomp($input_line) ) { $input_line_separator = $/ }

# The first test here very significantly speeds things up, but be sure to
    # keep the regex and hash %other_line_endings the same.
    if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
        if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
            $input_line_separator = $2 . $input_line_separator;
        }
    }

# for backwards compatibility we keep the line text terminated with
    # a newline character
    $input_line .= "\n";
    $tokenizer_self->[_line_of_text_] = $input_line;    # update

# create a data structure describing this line which will be
    # returned to the caller.

# _line_type codes are:
    #   SYSTEM         - system-specific code before hash-bang line
    #   CODE           - line of perl code (including comments)
    #   POD_START      - line starting pod, such as '=head'
    #   POD            - pod documentation text
    #   POD_END        - last line of pod section, '=cut'
    #   HERE           - text of here-document
    #   HERE_END       - last line of here-doc (target word)
    #   FORMAT         - format section
    #   FORMAT_END     - last line of format section, '.'
    #   DATA_START     - __DATA__ line
    #   DATA           - unidentified text following __DATA__
    #   END_START      - __END__ line
    #   END            - unidentified text following __END__
    #   ERROR          - we are in big trouble, probably not a perl script

# Other variables:
    #   _curly_brace_depth     - depth of curly braces at start of line
    #   _square_bracket_depth  - depth of square brackets at start of line
    #   _paren_depth           - depth of parens at start of line
    #   _starting_in_quote     - this line continues a multi-line quote
    #                            (so don't trim leading blanks!)
    #   _ending_in_quote       - this line ends in a multi-line quote
    #                            (so don't trim trailing blanks!)
    my $line_of_tokens = {
        _line_type                 => 'EOF',
        _line_text                 => $input_line,
        _line_number               => $input_line_number,
        _guessed_indentation_level => 0,
        _curly_brace_depth         => $brace_depth,
        _square_bracket_depth      => $square_bracket_depth,
        _paren_depth               => $paren_depth,
        _quote_character           => '',
##        _rtoken_type               => undef,
##        _rtokens                   => undef,
##        _rlevels                   => undef,
##        _rslevels                  => undef,
##        _rblock_type               => undef,
##        _rcontainer_type           => undef,
##        _rcontainer_environment    => undef,
##        _rtype_sequence            => undef,
##        _rnesting_tokens           => undef,
##        _rci_levels                => undef,
##        _rnesting_blocks           => undef,
##        _starting_in_quote         => 0,
##        _ending_in_quote           => 0,
    };

# must print line unchanged if we are in a here document
    if ( $tokenizer_self->[_in_here_doc_] ) {

$line_of_tokens->{_line_type} = 'HERE';
        my $here_doc_target      = $tokenizer_self->[_here_doc_target_];
        my $here_quote_character = $tokenizer_self->[_here_quote_character_];
        my $candidate_target     = $input_line;
        chomp $candidate_target;

# Handle <<~ targets, which are indicated here by a leading space on
        # the here quote character
        if ( $here_quote_character =~ /^\s/ ) {
            $candidate_target =~ s/^\s*//;
        }
        if ( $candidate_target eq $here_doc_target ) {
            $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
            $line_of_tokens->{_line_type} = 'HERE_END';
            write_logfile_entry("Exiting HERE document $here_doc_target\n");

my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
            if ( @{$rhere_target_list} ) {  # there can be multiple here targets
                ( $here_doc_target, $here_quote_character ) =
                  @{ shift @{$rhere_target_list} };
                $tokenizer_self->[_here_doc_target_] = $here_doc_target;
                $tokenizer_self->[_here_quote_character_] =
                  $here_quote_character;
                write_logfile_entry(
                    "Entering HERE document $here_doc_target\n");
                $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
                $tokenizer_self->[_started_looking_for_here_target_at_] =
                  $input_line_number;
            }
            else {
                $tokenizer_self->[_in_here_doc_]          = 0;
                $tokenizer_self->[_here_doc_target_]      = "";
                $tokenizer_self->[_here_quote_character_] = "";
            }
        }

# check for error of extra whitespace
        # note for PERL6: leading whitespace is allowed
        else {
            $candidate_target =~ s/\s*$//;
            $candidate_target =~ s/^\s*//;
            if ( $candidate_target eq $here_doc_target ) {
                $tokenizer_self->[_nearly_matched_here_target_at_] =
                  $input_line_number;
            }
        }
        return $line_of_tokens;
    }

# must print line unchanged if we are in a format section
    elsif ( $tokenizer_self->[_in_format_] ) {

if ( $input_line =~ /^\.[\s#]*$/ ) {
            write_logfile_entry("Exiting format section\n");
            $tokenizer_self->[_in_format_] = 0;
            $line_of_tokens->{_line_type} = 'FORMAT_END';
        }
        else {
            $line_of_tokens->{_line_type} = 'FORMAT';
        }
        return $line_of_tokens;
    }

# must print line unchanged if we are in pod documentation
    elsif ( $tokenizer_self->[_in_pod_] ) {

$line_of_tokens->{_line_type} = 'POD';
        if ( $input_line =~ /^=cut/ ) {
            $line_of_tokens->{_line_type} = 'POD_END';
            write_logfile_entry("Exiting POD section\n");
            $tokenizer_self->[_in_pod_] = 0;
        }
        if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
            warning(
                "Hash-bang in pod can cause older versions of perl to fail! \n"
            );
        }

return $line_of_tokens;
    }

# must print line unchanged if we have seen a severe error (i.e., we
    # are seeing illegal tokens and cannot continue.  Syntax errors do
    # not pass this route).  Calling routine can decide what to do, but
    # the default can be to just pass all lines as if they were after __END__
    elsif ( $tokenizer_self->[_in_error_] ) {
        $line_of_tokens->{_line_type} = 'ERROR';
        return $line_of_tokens;
    }

# print line unchanged if we are __DATA__ section
    elsif ( $tokenizer_self->[_in_data_] ) {

# ...but look for POD
        # Note that the _in_data and _in_end flags remain set
        # so that we return to that state after seeing the
        # end of a pod section
        if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
            $line_of_tokens->{_line_type} = 'POD_START';
            write_logfile_entry("Entering POD section\n");
            $tokenizer_self->[_in_pod_] = 1;
            return $line_of_tokens;
        }
        else {
            $line_of_tokens->{_line_type} = 'DATA';
            return $line_of_tokens;
        }
    }

# print line unchanged if we are in __END__ section
    elsif ( $tokenizer_self->[_in_end_] ) {

# ...but look for POD
        # Note that the _in_data and _in_end flags remain set
        # so that we return to that state after seeing the
        # end of a pod section
        if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
            $line_of_tokens->{_line_type} = 'POD_START';
            write_logfile_entry("Entering POD section\n");
            $tokenizer_self->[_in_pod_] = 1;
            return $line_of_tokens;
        }
        else {
            $line_of_tokens->{_line_type} = 'END';
            return $line_of_tokens;
        }
    }

# check for a hash-bang line if we haven't seen one
    if ( !$tokenizer_self->[_saw_hash_bang_] ) {
        if ( $input_line =~ /^\#\!.*perl\b/ ) {
            $tokenizer_self->[_saw_hash_bang_] = $input_line_number;

# check for -w and -P flags
            if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
                $tokenizer_self->[_saw_perl_dash_P_] = 1;
            }

if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
                $tokenizer_self->[_saw_perl_dash_w_] = 1;
            }

if (
                $input_line_number > 1

# leave any hash bang in a BEGIN block alone
                # i.e. see 'debugger-duck_type.t'
                && !(
                       $last_nonblank_block_type
                    && $last_nonblank_block_type eq 'BEGIN'
                )
                && !$tokenizer_self->[_look_for_hash_bang_]

# Try to avoid giving a false alarm at a simple comment.
                # These look like valid hash-bang lines:

#!/usr/bin/perl -w
                #!   /usr/bin/perl -w
                #!c:\perl\bin\perl.exe

# These are comments:
                #! I love perl
                #!  sunos does not yet provide a /usr/bin/perl

# Comments typically have multiple spaces, which suggests
                # the filter
                && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
              )
            {

# this is helpful for VMS systems; we may have accidentally
                # tokenized some DCL commands
                if ( $tokenizer_self->[_started_tokenizing_] ) {
                    warning(
"There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
                    );
                }
                else {
                    complain("Useless hash-bang after line 1\n");
                }
            }

# Report the leading hash-bang as a system line
            # This will prevent -dac from deleting it
            else {
                $line_of_tokens->{_line_type} = 'SYSTEM';
                return $line_of_tokens;
            }
        }
    }

# wait for a hash-bang before parsing if the user invoked us with -x
    if ( $tokenizer_self->[_look_for_hash_bang_]
        && !$tokenizer_self->[_saw_hash_bang_] )
    {
        $line_of_tokens->{_line_type} = 'SYSTEM';
        return $line_of_tokens;
    }

# a first line of the form ': #' will be marked as SYSTEM
    # since lines of this form may be used by tcsh
    if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
        $line_of_tokens->{_line_type} = 'SYSTEM';
        return $line_of_tokens;
    }

# now we know that it is ok to tokenize the line...
    # the line tokenizer will modify any of these private variables:
    #        _rhere_target_list
    #        _in_data
    #        _in_end
    #        _in_format
    #        _in_error
    #        _in_pod
    #        _in_quote
    my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
    tokenize_this_line($line_of_tokens);

# Now finish defining the return structure and return it
    $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];

# handle severe error (binary data in script)
    if ( $tokenizer_self->[_in_error_] ) {
        $tokenizer_self->[_in_quote_] = 0;    # to avoid any more messages
        warning("Giving up after error\n");
        $line_of_tokens->{_line_type} = 'ERROR';
        reset_indentation_level(0);           # avoid error messages
        return $line_of_tokens;
    }

# handle start of pod documentation
    if ( $tokenizer_self->[_in_pod_] ) {

# This gets tricky..above a __DATA__ or __END__ section, perl
        # accepts '=cut' as the start of pod section. But afterwards,
        # only pod utilities see it and they may ignore an =cut without
        # leading =head.  In any case, this isn't good.
        if ( $input_line =~ /^=cut\b/ ) {
            if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
            {
                complain("=cut while not in pod ignored\n");
                $tokenizer_self->[_in_pod_] = 0;
                $line_of_tokens->{_line_type} = 'POD_END';
            }
            else {
                $line_of_tokens->{_line_type} = 'POD_START';
                warning(
"=cut starts a pod section .. this can fool pod utilities.\n"
                );
                write_logfile_entry("Entering POD section\n");
            }
        }

else {
            $line_of_tokens->{_line_type} = 'POD_START';
            write_logfile_entry("Entering POD section\n");
        }

return $line_of_tokens;
    }

# Update indentation levels for log messages.
    # Skip blank lines and also block comments, unless a logfile is requested.
    # Note that _line_of_text_ is the input line but trimmed from left to right.
    my $lot = $tokenizer_self->[_line_of_text_];
    if ( $lot && ( $self->[_rOpts_logfile_] || substr( $lot, 0, 1 ) ne '#' ) ) {
        my $rlevels = $line_of_tokens->{_rlevels};
        $line_of_tokens->{_guessed_indentation_level} =
          guess_old_indentation_level($input_line);
    }

# see if this line contains here doc targets
    my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
    if ( @{$rhere_target_list} ) {

my ( $here_doc_target, $here_quote_character ) =
          @{ shift @{$rhere_target_list} };
        $tokenizer_self->[_in_here_doc_]          = 1;
        $tokenizer_self->[_here_doc_target_]      = $here_doc_target;
        $tokenizer_self->[_here_quote_character_] = $here_quote_character;
        write_logfile_entry("Entering HERE document $here_doc_target\n");
        $tokenizer_self->[_started_looking_for_here_target_at_] =
          $input_line_number;
    }

# NOTE: __END__ and __DATA__ statements are written unformatted
    # because they can theoretically contain additional characters
    # which are not tokenized (and cannot be read with <DATA> either!).
    if ( $tokenizer_self->[_in_data_] ) {
        $line_of_tokens->{_line_type} = 'DATA_START';
        write_logfile_entry("Starting __DATA__ section\n");
        $tokenizer_self->[_saw_data_] = 1;

# keep parsing after __DATA__ if use SelfLoader was seen
        if ( $tokenizer_self->[_saw_selfloader_] ) {
            $tokenizer_self->[_in_data_] = 0;
            write_logfile_entry(
                "SelfLoader seen, continuing; -nlsl deactivates\n");
        }

return $line_of_tokens;
    }

elsif ( $tokenizer_self->[_in_end_] ) {
        $line_of_tokens->{_line_type} = 'END_START';
        write_logfile_entry("Starting __END__ section\n");
        $tokenizer_self->[_saw_end_] = 1;

# keep parsing after __END__ if use AutoLoader was seen
        if ( $tokenizer_self->[_saw_autoloader_] ) {
            $tokenizer_self->[_in_end_] = 0;
            write_logfile_entry(
                "AutoLoader seen, continuing; -nlal deactivates\n");
        }
        return $line_of_tokens;
    }

# now, finally, we know that this line is type 'CODE'
    $line_of_tokens->{_line_type} = 'CODE';

# remember if we have seen any real code
    if (  !$tokenizer_self->[_started_tokenizing_]
        && $input_line !~ /^\s*$/
        && $input_line !~ /^\s*#/ )
    {
        $tokenizer_self->[_started_tokenizing_] = 1;
    }

if ( $tokenizer_self->[_debugger_object_] ) {
        $tokenizer_self->[_debugger_object_]
          ->write_debug_entry($line_of_tokens);
    }

# Note: if keyword 'format' occurs in this line code, it is still CODE
    # (keyword 'format' need not start a line)
    if ( $tokenizer_self->[_in_format_] ) {
        write_logfile_entry("Entering format section\n");
    }

if ( $tokenizer_self->[_in_quote_]
        and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
    {

#if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
        if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
            /^\s*$/ )
        {
            $tokenizer_self->[_line_start_quote_] = $input_line_number;
            write_logfile_entry(
                "Start multi-line quote or pattern ending in $quote_target\n");
        }
    }
    elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
        && !$tokenizer_self->[_in_quote_] )
    {
        $tokenizer_self->[_line_start_quote_] = -1;
        write_logfile_entry("End of multi-line quote or pattern\n");
    }

# we are returning a line of CODE
    return $line_of_tokens;
}

sub find_starting_indentation_level {

# We need to find the indentation level of the first line of the
    # script being formatted.  Often it will be zero for an entire file,
    # but if we are formatting a local block of code (within an editor for
    # example) it may not be zero.  The user may specify this with the
    # -sil=n parameter but normally doesn't so we have to guess.
    #
    # USES GLOBAL VARIABLES: $tokenizer_self
    my $starting_level = 0;

# use value if given as parameter
    if ( $tokenizer_self->[_know_starting_level_] ) {
        $starting_level = $tokenizer_self->[_starting_level_];
    }

# if we know there is a hash_bang line, the level must be zero
    elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
        $tokenizer_self->[_know_starting_level_] = 1;
    }

# otherwise figure it out from the input file
    else {
        my $line;
        my $i = 0;

# keep looking at lines until we find a hash bang or piece of code
        my $msg = "";
        while ( $line =
            $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
        {

# if first line is #! then assume starting level is zero
            if ( $i == 1 && $line =~ /^\#\!/ ) {
                $starting_level = 0;
                last;
            }
            next if ( $line =~ /^\s*#/ );    # skip past comments
            next if ( $line =~ /^\s*$/ );    # skip past blank lines
            $starting_level = guess_old_indentation_level($line);
            last;
        }
        $msg = "Line $i implies starting-indentation-level = $starting_level\n";
        write_logfile_entry("$msg");
    }
    $tokenizer_self->[_starting_level_] = $starting_level;
    reset_indentation_level($starting_level);
    return;
}

sub guess_old_indentation_level {
    my ($line) = @_;

# Guess the indentation level of an input line.
    #
    # For the first line of code this result will define the starting
    # indentation level.  It will mainly be non-zero when perltidy is applied
    # within an editor to a local block of code.
    #
    # This is an impossible task in general because we can't know what tabs
    # meant for the old script and how many spaces were used for one
    # indentation level in the given input script.  For example it may have
    # been previously formatted with -i=7 -et=3.  But we can at least try to
    # make sure that perltidy guesses correctly if it is applied repeatedly to
    # a block of code within an editor, so that the block stays at the same
    # level when perltidy is applied repeatedly.
    #
    # USES GLOBAL VARIABLES: $tokenizer_self
    my $level = 0;

# find leading tabs, spaces, and any statement label
    my $spaces = 0;
    if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {

# If there are leading tabs, we use the tab scheme for this run, if
        # any, so that the code will remain stable when editing.
        if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }

if ($2) { $spaces += length($2) }

# correct for outdented labels
        if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
            $spaces += $tokenizer_self->[_continuation_indentation_];
        }
    }

# compute indentation using the value of -i for this run.
    # If -i=0 is used for this run (which is possible) it doesn't matter
    # what we do here but we'll guess that the old run used 4 spaces per level.
    my $indent_columns = $tokenizer_self->[_indent_columns_];
    $indent_columns = 4 if ( !$indent_columns );
    $level          = int( $spaces / $indent_columns );
    return ($level);
}

# This is a currently unused debug routine
sub dump_functions {

my $fh = *STDOUT;
    foreach my $pkg ( keys %is_user_function ) {
        $fh->print("\nnon-constant subs in package $pkg\n");

foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
            my $msg = "";
            if ( $is_block_list_function{$pkg}{$sub} ) {
                $msg = 'block_list';
            }

if ( $is_block_function{$pkg}{$sub} ) {
                $msg = 'block';
            }
            $fh->print("$sub $msg\n");
        }
    }

foreach my $pkg ( keys %is_constant ) {
        $fh->print("\nconstants and constant subs in package $pkg\n");

foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
            $fh->print("$sub\n");
        }
    }
    return;
}

sub prepare_for_a_new_file {

# previous tokens needed to determine what to expect next
    $last_nonblank_token      = ';';    # the only possible starting state which
    $last_nonblank_type       = ';';    # will make a leading brace a code block
    $last_nonblank_block_type = '';

# scalars for remembering statement types across multiple lines
    $statement_type    = '';            # '' or 'use' or 'sub..' or 'case..'
    $in_attribute_list = 0;

# scalars for remembering where we are in the file
    $current_package = "main";
    $context         = UNKNOWN_CONTEXT;

# hashes used to remember function information
    %is_constant             = ();      # user-defined constants
    %is_user_function        = ();      # user-defined functions
    %user_function_prototype = ();      # their prototypes
    %is_block_function       = ();
    %is_block_list_function  = ();
    %saw_function_definition = ();
    %saw_use_module          = ();

# variables used to track depths of various containers
    # and report nesting errors
    $paren_depth             = 0;
    $brace_depth             = 0;
    $square_bracket_depth    = 0;
    @current_depth           = (0) x scalar @closing_brace_names;
    $total_depth             = 0;
    @total_depth             = ();
    @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
    @current_sequence_number = ();

@paren_type                     = ();
    @paren_semicolon_count          = ();
    @paren_structural_type          = ();
    @brace_type                     = ();
    @brace_structural_type          = ();
    @brace_context                  = ();
    @brace_package                  = ();
    @square_bracket_type            = ();
    @square_bracket_structural_type = ();
    @depth_array                    = ();
    @nested_ternary_flag            = ();
    @nested_statement_type          = ();
    @starting_line_of_current_depth = ();

$paren_type[$paren_depth]            = '';
    $paren_semicolon_count[$paren_depth] = 0;
    $paren_structural_type[$brace_depth] = '';
    $brace_type[$brace_depth] = ';';    # identify opening brace as code block
    $brace_structural_type[$brace_depth]                   = '';
    $brace_context[$brace_depth]                           = UNKNOWN_CONTEXT;
    $brace_package[$paren_depth]                           = $current_package;
    $square_bracket_type[$square_bracket_depth]            = '';
    $square_bracket_structural_type[$square_bracket_depth] = '';

initialize_tokenizer_state();
    return;
}

{    ## closure for sub tokenize_this_line

use constant BRACE          => 0;
    use constant SQUARE_BRACKET => 1;
    use constant PAREN          => 2;
    use constant QUESTION_COLON => 3;

# TV1: scalars for processing one LINE.
    # Re-initialized on each entry to sub tokenize_this_line.
    my (
        $block_type,        $container_type,    $expecting,
        $i,                 $i_tok,             $input_line,
        $input_line_number, $last_nonblank_i,   $max_token_index,
        $next_tok,          $next_type,         $peeked_ahead,
        $prototype,         $rhere_target_list, $rtoken_map,
        $rtoken_type,       $rtokens,           $tok,
        $type,              $type_sequence,     $indent_flag,
    );

# TV2: refs to ARRAYS for processing one LINE
    # Re-initialized on each call.
    my $routput_token_list     = [];    # stack of output token indexes
    my $routput_token_type     = [];    # token types
    my $routput_block_type     = [];    # types of code block
    my $routput_container_type = [];    # paren types, such as if, elsif, ..
    my $routput_type_sequence  = [];    # nesting sequential number
    my $routput_indent_flag    = [];    #

# TV3: SCALARS for quote variables.  These are initialized with a
    # subroutine call and continually updated as lines are processed.
    my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
        $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );

# TV4: SCALARS for multi-line identifiers and
    # statements. These are initialized with a subroutine call
    # and continually updated as lines are processed.
    my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );

# TV5: SCALARS for tracking indentation level.
    # Initialized once and continually updated as lines are
    # processed.
    my (
        $nesting_token_string,      $nesting_type_string,
        $nesting_block_string,      $nesting_block_flag,
        $nesting_list_string,       $nesting_list_flag,
        $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
        $in_statement_continuation, $level_in_tokenizer,
        $slevel_in_tokenizer,       $rslevel_stack,
    );

# TV6: SCALARS for remembering several previous
    # tokens. Initialized once and continually updated as
    # lines are processed.
    my (
        $last_nonblank_container_type,     $last_nonblank_type_sequence,
        $last_last_nonblank_token,         $last_last_nonblank_type,
        $last_last_nonblank_block_type,    $last_last_nonblank_container_type,
        $last_last_nonblank_type_sequence, $last_nonblank_prototype,
    );

# ----------------------------------------------------------------
    # beginning of tokenizer variable access and manipulation routines
    # ----------------------------------------------------------------

sub initialize_tokenizer_state {

# TV1: initialized on each call
        # TV2: initialized on each call
        # TV3:
        $in_quote                = 0;
        $quote_type              = 'Q';
        $quote_character         = "";
        $quote_pos               = 0;
        $quote_depth             = 0;
        $quoted_string_1         = "";
        $quoted_string_2         = "";
        $allowed_quote_modifiers = "";

# TV4:
        $id_scan_state     = '';
        $identifier        = '';
        $want_paren        = "";
        $indented_if_level = 0;

# TV5:
        $nesting_token_string             = "";
        $nesting_type_string              = "";
        $nesting_block_string             = '1';    # initially in a block
        $nesting_block_flag               = 1;
        $nesting_list_string              = '0';    # initially not in a list
        $nesting_list_flag                = 0;      # initially not in a list
        $ci_string_in_tokenizer           = "";
        $continuation_string_in_tokenizer = "0";
        $in_statement_continuation        = 0;
        $level_in_tokenizer               = 0;
        $slevel_in_tokenizer              = 0;
        $rslevel_stack                    = [];

# TV6:
        $last_nonblank_container_type      = '';
        $last_nonblank_type_sequence       = '';
        $last_last_nonblank_token          = ';';
        $last_last_nonblank_type           = ';';
        $last_last_nonblank_block_type     = '';
        $last_last_nonblank_container_type = '';
        $last_last_nonblank_type_sequence  = '';
        $last_nonblank_prototype           = "";
        return;
    }

sub save_tokenizer_state {

my $rTV1 = [
            $block_type,        $container_type,    $expecting,
            $i,                 $i_tok,             $input_line,
            $input_line_number, $last_nonblank_i,   $max_token_index,
            $next_tok,          $next_type,         $peeked_ahead,
            $prototype,         $rhere_target_list, $rtoken_map,
            $rtoken_type,       $rtokens,           $tok,
            $type,              $type_sequence,     $indent_flag,
        ];

my $rTV2 = [
            $routput_token_list,    $routput_token_type,
            $routput_block_type,    $routput_container_type,
            $routput_type_sequence, $routput_indent_flag,
        ];

my $rTV3 = [
            $in_quote,        $quote_type,
            $quote_character, $quote_pos,
            $quote_depth,     $quoted_string_1,
            $quoted_string_2, $allowed_quote_modifiers,
        ];

my $rTV4 =
          [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];

my $rTV5 = [
            $nesting_token_string,      $nesting_type_string,
            $nesting_block_string,      $nesting_block_flag,
            $nesting_list_string,       $nesting_list_flag,
            $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
            $in_statement_continuation, $level_in_tokenizer,
            $slevel_in_tokenizer,       $rslevel_stack,
        ];

my $rTV6 = [
            $last_nonblank_container_type,
            $last_nonblank_type_sequence,
            $last_last_nonblank_token,
            $last_last_nonblank_type,
            $last_last_nonblank_block_type,
            $last_last_nonblank_container_type,
            $last_last_nonblank_type_sequence,
            $last_nonblank_prototype,
        ];
        return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
    }

sub restore_tokenizer_state {
        my ($rstate) = @_;
        my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
        (
            $block_type,        $container_type,    $expecting,
            $i,                 $i_tok,             $input_line,
            $input_line_number, $last_nonblank_i,   $max_token_index,
            $next_tok,          $next_type,         $peeked_ahead,
            $prototype,         $rhere_target_list, $rtoken_map,
            $rtoken_type,       $rtokens,           $tok,
            $type,              $type_sequence,     $indent_flag,
        ) = @{$rTV1};

(
            $routput_token_list,    $routput_token_type,
            $routput_block_type,    $routput_container_type,
            $routput_type_sequence, $routput_type_sequence,
        ) = @{$rTV2};

(
            $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
            $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
        ) = @{$rTV3};

( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
          @{$rTV4};

(
            $nesting_token_string,      $nesting_type_string,
            $nesting_block_string,      $nesting_block_flag,
            $nesting_list_string,       $nesting_list_flag,
            $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
            $in_statement_continuation, $level_in_tokenizer,
            $slevel_in_tokenizer,       $rslevel_stack,
        ) = @{$rTV5};

(
            $last_nonblank_container_type,
            $last_nonblank_type_sequence,
            $last_last_nonblank_token,
            $last_last_nonblank_type,
            $last_last_nonblank_block_type,
            $last_last_nonblank_container_type,
            $last_last_nonblank_type_sequence,
            $last_nonblank_prototype,
        ) = @{$rTV6};
        return;
    }

sub get_indentation_level {

# patch to avoid reporting error if indented if is not terminated
        if ($indented_if_level) { return $level_in_tokenizer - 1 }
        return $level_in_tokenizer;
    }

sub reset_indentation_level {
        $level_in_tokenizer = $slevel_in_tokenizer = shift;
        push @{$rslevel_stack}, $slevel_in_tokenizer;
        return;
    }

sub peeked_ahead {
        my $flag = shift;
        $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
        return $peeked_ahead;
    }

# ------------------------------------------------------------
    # end of tokenizer variable access and manipulation routines
    # ------------------------------------------------------------

# ------------------------------------------------------------
    # beginning of various scanner interface routines
    # ------------------------------------------------------------
    sub scan_replacement_text {

# check for here-docs in replacement text invoked by
        # a substitution operator with executable modifier 'e'.
        #
        # given:
        #  $replacement_text
        # return:
        #  $rht = reference to any here-doc targets
        my ($replacement_text) = @_;

# quick check
        return unless ( $replacement_text =~ /<</ );

write_logfile_entry("scanning replacement text for here-doc targets\n");

# save the logger object for error messages
        my $logger_object = $tokenizer_self->[_logger_object_];

# localize all package variables
        local (
            $tokenizer_self,                 $last_nonblank_token,
            $last_nonblank_type,             $last_nonblank_block_type,
            $statement_type,                 $in_attribute_list,
            $current_package,                $context,
            %is_constant,                    %is_user_function,
            %user_function_prototype,        %is_block_function,
            %is_block_list_function,         %saw_function_definition,
            $brace_depth,                    $paren_depth,
            $square_bracket_depth,           @current_depth,
            @total_depth,                    $total_depth,
            @nesting_sequence_number,        @current_sequence_number,
            @paren_type,                     @paren_semicolon_count,
            @paren_structural_type,          @brace_type,
            @brace_structural_type,          @brace_context,
            @brace_package,                  @square_bracket_type,
            @square_bracket_structural_type, @depth_array,
            @starting_line_of_current_depth, @nested_ternary_flag,
            @nested_statement_type,
        );

# save all lexical variables
        my $rstate = save_tokenizer_state();
        _decrement_count();    # avoid error check for multiple tokenizers

# make a new tokenizer
        my $rOpts = {};
        my $rpending_logfile_message;
        my $source_object = Perl::Tidy::LineSource->new(
            input_file               => \$replacement_text,
            rOpts                    => $rOpts,
            rpending_logfile_message => $rpending_logfile_message,
        );
        my $tokenizer = Perl::Tidy::Tokenizer->new(
            source_object        => $source_object,
            logger_object        => $logger_object,
            starting_line_number => $input_line_number,
        );

# scan the replacement text
        1 while ( $tokenizer->get_line() );

# remove any here doc targets
        my $rht = undef;
        if ( $tokenizer_self->[_in_here_doc_] ) {
            $rht = [];
            push @{$rht},
              [
                $tokenizer_self->[_here_doc_target_],
                $tokenizer_self->[_here_quote_character_]
              ];
            if ( $tokenizer_self->[_rhere_target_list_] ) {
                push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
                $tokenizer_self->[_rhere_target_list_] = undef;
            }
            $tokenizer_self->[_in_here_doc_] = undef;
        }

# now its safe to report errors
        my $severe_error = $tokenizer->report_tokenization_errors();

# TODO: Could propagate a severe error up

# restore all tokenizer lexical variables
        restore_tokenizer_state($rstate);

# return the here doc targets
        return $rht;
    }

sub scan_bare_identifier {
        ( $i, $tok, $type, $prototype ) =
          scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
            $rtoken_map, $max_token_index );
        return;
    }

sub scan_identifier {
        ( $i, $tok, $type, $id_scan_state, $identifier ) =
          scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
            $max_token_index, $expecting, $paren_type[$paren_depth] );
        return;
    }

use constant VERIFY_FASTSCAN => 0;
    my %fast_scan_context;

BEGIN {
        %fast_scan_context = (
            '$' => SCALAR_CONTEXT,
            '*' => SCALAR_CONTEXT,
            '@' => LIST_CONTEXT,
            '%' => LIST_CONTEXT,
            '&' => UNKNOWN_CONTEXT,
        );
    }

sub scan_identifier_fast {

# This is a wrapper for sub scan_identifier. It does a fast preliminary
        # scan for certain common identifiers:
        #   '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
        # If it does not find one of these, or this is a restart, it calls the
        # original scanner directly.

# This gives the same results as the full scanner in about 1/4 the
        # total runtime for a typical input stream.

my $i_begin   = $i;
        my $tok_begin = $tok;
        my $fast_scan_type;

###############################
        # quick scan with leading sigil
        ###############################
        if (  !$id_scan_state
            && $i + 1 <= $max_token_index
            && $fast_scan_context{$tok} )
        {
            $context = $fast_scan_context{$tok};

# look for $var, @var, ...
            if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
                my $pretype_next = "";
                my $i_next       = $i + 2;
                if ( $i_next <= $max_token_index ) {
                    if (   $rtoken_type->[$i_next] eq 'b'
                        && $i_next < $max_token_index )
                    {
                        $i_next += 1;
                    }
                    $pretype_next = $rtoken_type->[$i_next];
                }
                if ( $pretype_next ne ':' && $pretype_next ne "'" ) {

# Found type 'i' like '$var', '@var', or '%var'
                    $identifier     = $tok . $rtokens->[ $i + 1 ];
                    $tok            = $identifier;
                    $type           = 'i';
                    $i              = $i + 1;
                    $fast_scan_type = $type;
                }
            }

# Look for @{ or %{  .
            # But we must let the full scanner handle things ${ because it may
            # keep going to get a complete identifier like '${#}'  .
            elsif (
                $rtoken_type->[ $i + 1 ] eq '{'
                && (   $tok_begin eq '@'
                    || $tok_begin eq '%' )
              )
            {

$identifier     = $tok;
                $type           = 't';
                $fast_scan_type = $type;
            }
        }

############################
        # Quick scan with leading ->
        # Look for ->[ and ->{
        ############################
        elsif (
               $tok eq '->'
            && $i < $max_token_index
            && (   $rtokens->[ $i + 1 ] eq '{'
                || $rtokens->[ $i + 1 ] eq '[' )
          )
        {
            $type           = $tok;
            $fast_scan_type = $type;
            $identifier     = $tok;
            $context        = UNKNOWN_CONTEXT;
        }

#######################################
        # Verify correctness during development
        #######################################
        if ( VERIFY_FASTSCAN && $fast_scan_type ) {

# We will call the full method
            my $identifier_simple = $identifier;
            my $tok_simple        = $tok;
            my $fast_scan_type    = $type;
            my $i_simple          = $i;
            my $context_simple    = $context;

$tok = $tok_begin;
            $i   = $i_begin;
            scan_identifier();

if (   $tok ne $tok_simple
                || $type ne $fast_scan_type
                || $i != $i_simple
                || $identifier ne $identifier_simple
                || $id_scan_state
                || $context ne $context_simple )
            {
                print STDERR <<EOM;
scan_identifier_fast differs from scan_identifier:
simple:  i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
full:    i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
EOM
            }
        }

###################################################
        # call full scanner if fast method did not succeed
        ###################################################
        if ( !$fast_scan_type ) {
            scan_identifier();
        }
        return;
    }

sub scan_id {
        ( $i, $tok, $type, $id_scan_state ) =
          scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
            $id_scan_state, $max_token_index );
        return;
    }

sub scan_number {
        my $number;
        ( $i, $type, $number ) =
          scan_number_do( $input_line, $i, $rtoken_map, $type,
            $max_token_index );
        return $number;
    }

use constant VERIFY_FASTNUM => 0;

sub scan_number_fast {

# This is a wrapper for sub scan_number. It does a fast preliminary
        # scan for a simple integer.  It calls the original scan_number if it
        # does not find one.

my $i_begin   = $i;
        my $tok_begin = $tok;
        my $number;

##################################
        # Quick check for (signed) integer
        ##################################

# This will be the string of digits:
        my $i_d   = $i;
        my $tok_d = $tok;
        my $typ_d = $rtoken_type->[$i_d];

# check for signed integer
        my $sign = "";
        if (   $typ_d ne 'd'
            && ( $typ_d eq '+' || $typ_d eq '-' )
            && $i_d < $max_token_index )
        {
            $sign = $tok_d;
            $i_d++;
            $tok_d = $rtokens->[$i_d];
            $typ_d = $rtoken_type->[$i_d];
        }

# Handle integers
        if (
            $typ_d eq 'd'
            && (
                $i_d == $max_token_index
                || (   $i_d < $max_token_index
                    && $rtoken_type->[ $i_d + 1 ] ne '.'
                    && $rtoken_type->[ $i_d + 1 ] ne 'w' )
            )
          )
        {
            # Let let full scanner handle multi-digit integers beginning with
            # '0' because there could be error messages.  For example, '009' is
            # not a valid number.

if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
                $number = $sign . $tok_d;
                $type   = 'n';
                $i      = $i_d;
            }
        }

#######################################
        # Verify correctness during development
        #######################################
        if ( VERIFY_FASTNUM && defined($number) ) {

# We will call the full method
            my $type_simple   = $type;
            my $i_simple      = $i;
            my $number_simple = $number;

$tok    = $tok_begin;
            $i      = $i_begin;
            $number = scan_number();

if (   $type ne $type_simple
                || ( $i != $i_simple && $i <= $max_token_index )
                || $number ne $number_simple )
            {
                print STDERR <<EOM;
scan_number_fast differs from scan_number:
simple:  i=$i_simple, type=$type_simple, number=$number_simple
full:  i=$i, type=$type, number=$number
EOM
            }
        }

#########################################
        # call full scanner if may not be integer
        #########################################
        if ( !defined($number) ) {
            $number = scan_number();
        }
        return $number;
    }

# a sub to warn if token found where term expected
    sub error_if_expecting_TERM {
        if ( $expecting == TERM ) {
            if ( $really_want_term{$last_nonblank_type} ) {
                report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
                    $rtoken_map, $rtoken_type, $input_line );
                return 1;
            }
        }
        return;
    }

# a sub to warn if token found where operator expected
    sub error_if_expecting_OPERATOR {
        my $thing = shift;
        if ( $expecting == OPERATOR ) {
            if ( !defined($thing) ) { $thing = $tok }
            report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
                $rtoken_map, $rtoken_type, $input_line );
            if ( $i_tok == 0 ) {
                interrupt_logfile();
                warning("Missing ';' or ',' above?\n");
                resume_logfile();
            }
            return 1;
        }
        return;
    }

# ------------------------------------------------------------
    # end scanner interfaces
    # ------------------------------------------------------------

my %is_for_foreach;
    @_ = qw(for foreach);
    @is_for_foreach{@_} = (1) x scalar(@_);

my %is_my_our_state;
    @_ = qw(my our state);
    @is_my_our_state{@_} = (1) x scalar(@_);

# These keywords may introduce blocks after parenthesized expressions,
    # in the form:
    # keyword ( .... ) { BLOCK }
    # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
    my %is_blocktype_with_paren;
    @_ =
      qw(if elsif unless while until for foreach switch case given when catch);
    @is_blocktype_with_paren{@_} = (1) x scalar(@_);

my %is_case_default;
    @_ = qw(case default);
    @is_case_default{@_} = (1) x scalar(@_);

# ------------------------------------------------------------
    # begin hash of code for handling most token types
    # ------------------------------------------------------------
    my $tokenization_code = {

# no special code for these types yet, but syntax checks
        # could be added

##      '!'   => undef,
##      '!='  => undef,
##      '!~'  => undef,
##      '%='  => undef,
##      '&&=' => undef,
##      '&='  => undef,
##      '+='  => undef,
##      '-='  => undef,
##      '..'  => undef,
##      '..'  => undef,
##      '...' => undef,
##      '.='  => undef,
##      '<<=' => undef,
##      '<='  => undef,
##      '<=>' => undef,
##      '<>'  => undef,
##      '='   => undef,
##      '=='  => undef,
##      '=~'  => undef,
##      '>='  => undef,
##      '>>'  => undef,
##      '>>=' => undef,
##      '\\'  => undef,
##      '^='  => undef,
##      '|='  => undef,
##      '||=' => undef,
##      '//=' => undef,
##      '~'   => undef,
##      '~~'  => undef,
##      '!~~'  => undef,

'>' => sub {
            error_if_expecting_TERM()
              if ( $expecting == TERM );
        },
        '|' => sub {
            error_if_expecting_TERM()
              if ( $expecting == TERM );
        },
        '$' => sub {

# start looking for a scalar
            error_if_expecting_OPERATOR("Scalar")
              if ( $expecting == OPERATOR );
            scan_identifier_fast();

if ( $identifier eq '$^W' ) {
                $tokenizer_self->[_saw_perl_dash_w_] = 1;
            }

# Check for identifier in indirect object slot
            # (vorboard.pl, sort.t).  Something like:
            #   /^(print|printf|sort|exec|system)$/
            if (
                $is_indirect_object_taker{$last_nonblank_token}
                || ( ( $last_nonblank_token eq '(' )
                    && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
                || (   $last_nonblank_type eq 'w'
                    || $last_nonblank_type eq 'U' )    # possible object
              )
            {
                $type = 'Z';
            }
        },
        '(' => sub {

++$paren_depth;
            $paren_semicolon_count[$paren_depth] = 0;
            if ($want_paren) {
                $container_type = $want_paren;
                $want_paren     = "";
            }
            elsif ( $statement_type =~ /^sub\b/ ) {
                $container_type = $statement_type;
            }
            else {
                $container_type = $last_nonblank_token;

# We can check for a syntax error here of unexpected '(',
                # but this is going to get messy...
                if (
                    $expecting == OPERATOR

# be sure this is not a method call of the form
                    # &method(...), $method->(..), &{method}(...),
                    # $ref[2](list) is ok & short for $ref[2]->(list)
                    # NOTE: at present, braces in something like &{ xxx }
                    # are not marked as a block, we might have a method call
                    && $last_nonblank_token !~ /^([\]\}\&]|\-\>)/

)
                {

# ref: camel 3 p 703.
                    if ( $last_last_nonblank_token eq 'do' ) {
                        complain(
"do SUBROUTINE is deprecated; consider & or -> notation\n"
                        );
                    }
                    else {

# if this is an empty list, (), then it is not an
                        # error; for example, we might have a constant pi and
                        # invoke it with pi() or just pi;
                        my ( $next_nonblank_token, $i_next ) =
                          find_next_nonblank_token( $i, $rtokens,
                            $max_token_index );
                        if ( $next_nonblank_token ne ')' ) {
                            my $hint;

# FIXME: this gives an error parsing something like
                            #       $subsubs[0]()(0);
                            # which is a valid syntax (see subsub.t).  We may
                            # need to revise this coding.
                            error_if_expecting_OPERATOR('(');

if ( $last_nonblank_type eq 'C' ) {
                                $hint =
                                  "$last_nonblank_token has a void prototype\n";
                            }
                            elsif ( $last_nonblank_type eq 'i' ) {
                                if (   $i_tok > 0
                                    && $last_nonblank_token =~ /^\$/ )
                                {
                                    $hint =
"Do you mean '$last_nonblank_token->(' ?\n";
                                }
                            }
                            if ($hint) {
                                interrupt_logfile();
                                warning($hint);
                                resume_logfile();
                            }
                        } ## end if ( $next_nonblank_token...
                    } ## end else [ if ( $last_last_nonblank_token...
                } ## end if ( $expecting == OPERATOR...
            }
            $paren_type[$paren_depth] = $container_type;
            ( $type_sequence, $indent_flag ) =
              increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );

# propagate types down through nested parens
            # for example: the second paren in 'if ((' would be structural
            # since the first is.

if ( $last_nonblank_token eq '(' ) {
                $type = $last_nonblank_type;
            }

#     We exclude parens as structural after a ',' because it
            #     causes subtle problems with continuation indentation for
            #     something like this, where the first 'or' will not get
            #     indented.
            #
            #         assert(
            #             __LINE__,
            #             ( not defined $check )
            #               or ref $check
            #               or $check eq "new"
            #               or $check eq "old",
            #         );
            #
            #     Likewise, we exclude parens where a statement can start
            #     because of problems with continuation indentation, like
            #     these:
            #
            #         ($firstline =~ /^#\!.*perl/)
            #         and (print $File::Find::name, "\n")
            #           and (return 1);
            #
            #         (ref($usage_fref) =~ /CODE/)
            #         ? &$usage_fref
            #           : (&blast_usage, &blast_params, &blast_general_params);

else {
                $type = '{';
            }

if ( $last_nonblank_type eq ')' ) {
                warning(
                    "Syntax error? found token '$last_nonblank_type' then '('\n"
                );
            }
            $paren_structural_type[$paren_depth] = $type;

},
        ')' => sub {
            ( $type_sequence, $indent_flag ) =
              decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );

if ( $paren_structural_type[$paren_depth] eq '{' ) {
                $type = '}';
            }

$container_type = $paren_type[$paren_depth];

# restore statement type as 'sub' at closing paren of a signature
            # so that a subsequent ':' is identified as an attribute
            if ( $container_type =~ /^sub\b/ ) {
                $statement_type = $container_type;
            }

#    /^(for|foreach)$/
            if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
                my $num_sc = $paren_semicolon_count[$paren_depth];
                if ( $num_sc > 0 && $num_sc != 2 ) {
                    warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
                }
            }

if ( $paren_depth > 0 ) { $paren_depth-- }
        },
        ',' => sub {
            if ( $last_nonblank_type eq ',' ) {
                complain("Repeated ','s \n");
            }

# patch for operator_expected: note if we are in the list (use.t)
            if ( $statement_type eq 'use' ) { $statement_type = '_use' }
##                FIXME: need to move this elsewhere, perhaps check after a '('
##                elsif ($last_nonblank_token eq '(') {
##                    warning("Leading ','s illegal in some versions of perl\n");
##                }
        },
        ';' => sub {
            $context        = UNKNOWN_CONTEXT;
            $statement_type = '';
            $want_paren     = "";

#    /^(for|foreach)$/
            if ( $is_for_foreach{ $paren_type[$paren_depth] } )
            {    # mark ; in for loop

# Be careful: we do not want a semicolon such as the
                # following to be included:
                #
                #    for (sort {strcoll($a,$b);} keys %investments) {

if (   $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
                    && $square_bracket_depth ==
                    $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
                {

$type = 'f';
                    $paren_semicolon_count[$paren_depth]++;
                }
            }

},
        '"' => sub {
            error_if_expecting_OPERATOR("String")
              if ( $expecting == OPERATOR );
            $in_quote                = 1;
            $type                    = 'Q';
            $allowed_quote_modifiers = "";
        },
        "'" => sub {
            error_if_expecting_OPERATOR("String")
              if ( $expecting == OPERATOR );
            $in_quote                = 1;
            $type                    = 'Q';
            $allowed_quote_modifiers = "";
        },
        '`' => sub {
            error_if_expecting_OPERATOR("String")
              if ( $expecting == OPERATOR );
            $in_quote                = 1;
            $type                    = 'Q';
            $allowed_quote_modifiers = "";
        },
        '/' => sub {
            my $is_pattern;

# a pattern cannot follow certain keywords which take optional
            # arguments, like 'shift' and 'pop'. See also '?'.
            if ( $last_nonblank_type eq 'k'
                && $is_keyword_rejecting_slash_as_pattern_delimiter{
                    $last_nonblank_token} )
            {
                $is_pattern = 0;
            }
            elsif ( $expecting == UNKNOWN ) {    # indeterminate, must guess..
                my $msg;
                ( $is_pattern, $msg ) =
                  guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
                    $max_token_index );

if ($msg) {
                    write_diagnostics("DIVIDE:$msg\n");
                    write_logfile_entry($msg);
                }
            }
            else { $is_pattern = ( $expecting == TERM ) }

if ($is_pattern) {
                $in_quote                = 1;
                $type                    = 'Q';
                $allowed_quote_modifiers = '[msixpodualngc]';
            }
            else {    # not a pattern; check for a /= token

if ( $rtokens->[ $i + 1 ] eq '=' ) {    # form token /=
                    $i++;
                    $tok  = '/=';
                    $type = $tok;
                }

#DEBUG - collecting info on what tokens follow a divide
              # for development of guessing algorithm
              #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
              #    #write_diagnostics( "DIVIDE? $input_line\n" );
              #}
            }
        },
        '{' => sub {

# if we just saw a ')', we will label this block with
            # its type.  We need to do this to allow sub
            # code_block_type to determine if this brace starts a
            # code block or anonymous hash.  (The type of a paren
            # pair is the preceding token, such as 'if', 'else',
            # etc).
            $container_type = "";

# ATTRS: for a '{' following an attribute list, reset
            # things to look like we just saw the sub name
            # FIXME: need to end with \b here??
            if ( $statement_type =~ /^sub/ ) {
                $last_nonblank_token = $statement_type;
                $last_nonblank_type  = 'i';
                $statement_type      = "";
            }

# patch for SWITCH/CASE: hide these keywords from an immediately
            # following opening brace
            elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
                && $statement_type eq $last_nonblank_token )
            {
                $last_nonblank_token = ";";
            }

elsif ( $last_nonblank_token eq ')' ) {
                $last_nonblank_token = $paren_type[ $paren_depth + 1 ];

# defensive move in case of a nesting error (pbug.t)
                # in which this ')' had no previous '('
                # this nesting error will have been caught
                if ( !defined($last_nonblank_token) ) {
                    $last_nonblank_token = 'if';
                }

# check for syntax error here;
                unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
                    if ( $tokenizer_self->[_extended_syntax_] ) {

# we append a trailing () to mark this as an unknown
                        # block type.  This allows perltidy to format some
                        # common extensions of perl syntax.
                        # This is used by sub code_block_type
                        $last_nonblank_token .= '()';
                    }
                    else {
                        my $list =
                          join( ' ', sort keys %is_blocktype_with_paren );
                        warning(
"syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
                        );
                    }
                }
            }

# patch for paren-less for/foreach glitch, part 2.
            # see note below under 'qw'
            elsif ($last_nonblank_token eq 'qw'
                && $is_for_foreach{$want_paren} )
            {
                $last_nonblank_token = $want_paren;
                if ( $last_last_nonblank_token eq $want_paren ) {
                    warning(
"syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
                    );

}
                $want_paren = "";
            }

# now identify which of the three possible types of
            # curly braces we have: hash index container, anonymous
            # hash reference, or code block.

# non-structural (hash index) curly brace pair
            # get marked 'L' and 'R'
            if ( is_non_structural_brace() ) {
                $type = 'L';

# patch for SWITCH/CASE:
                # allow paren-less identifier after 'when'
                # if the brace is preceded by a space
                if (   $statement_type eq 'when'
                    && $last_nonblank_type eq 'i'
                    && $last_last_nonblank_type eq 'k'
                    && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
                {
                    $type       = '{';
                    $block_type = $statement_type;
                }
            }

# code and anonymous hash have the same type, '{', but are
            # distinguished by 'block_type',
            # which will be blank for an anonymous hash
            else {

$block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
                    $max_token_index );

# patch to promote bareword type to function taking block
                if (   $block_type
                    && $last_nonblank_type eq 'w'
                    && $last_nonblank_i >= 0 )
                {
                    if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
                        $routput_token_type->[$last_nonblank_i] = 'G';
                    }
                }

# patch for SWITCH/CASE: if we find a stray opening block brace
                # where we might accept a 'case' or 'when' block, then take it
                if (   $statement_type eq 'case'
                    || $statement_type eq 'when' )
                {
                    if ( !$block_type || $block_type eq '}' ) {
                        $block_type = $statement_type;
                    }
                }
            }

$brace_type[ ++$brace_depth ]        = $block_type;
            $brace_package[$brace_depth]         = $current_package;
            $brace_structural_type[$brace_depth] = $type;
            $brace_context[$brace_depth]         = $context;
            ( $type_sequence, $indent_flag ) =
              increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
        },
        '}' => sub {
            $block_type = $brace_type[$brace_depth];
            if ($block_type) { $statement_type = '' }
            if ( defined( $brace_package[$brace_depth] ) ) {
                $current_package = $brace_package[$brace_depth];
            }

# can happen on brace error (caught elsewhere)
            else {
            }
            ( $type_sequence, $indent_flag ) =
              decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );

if ( $brace_structural_type[$brace_depth] eq 'L' ) {
                $type = 'R';
            }

# propagate type information for 'do' and 'eval' blocks, and also
            # for smartmatch operator.  This is necessary to enable us to know
            # if an operator or term is expected next.
            if ( $is_block_operator{$block_type} ) {
                $tok = $block_type;
            }

$context = $brace_context[$brace_depth];
            if ( $brace_depth > 0 ) { $brace_depth--; }
        },
        '&' => sub {    # maybe sub call? start looking

# We have to check for sub call unless we are sure we
            # are expecting an operator.  This example from s2p
            # got mistaken as a q operator in an early version:
            #   print BODY &q(<<'EOT');
            if ( $expecting != OPERATOR ) {

# But only look for a sub call if we are expecting a term or
                # if there is no existing space after the &.
                # For example we probably don't want & as sub call here:
                #    Fcntl::S_IRUSR & $mode;
                if ( $expecting == TERM || $next_type ne 'b' ) {
                    scan_identifier_fast();
                }
            }
            else {
            }
        },
        '<' => sub {    # angle operator or less than?

if ( $expecting != OPERATOR ) {
                ( $i, $type ) =
                  find_angle_operator_termination( $input_line, $i, $rtoken_map,
                    $expecting, $max_token_index );

##  This message is not very helpful and quite confusing if the above
                ##  routine decided not to write a message with the line number.
                ##  if ( $type eq '<' && $expecting == TERM ) {
                ##      error_if_expecting_TERM();
                ##      interrupt_logfile();
                ##      warning("Unterminated <> operator?\n");
                ##      resume_logfile();
                ##  }

}
            else {
            }
        },
        '?' => sub {    # ?: conditional or starting pattern?

my $is_pattern;

# Patch for rt #126965
            # a pattern cannot follow certain keywords which take optional
            # arguments, like 'shift' and 'pop'. See also '/'.
            if ( $last_nonblank_type eq 'k'
                && $is_keyword_rejecting_question_as_pattern_delimiter{
                    $last_nonblank_token} )
            {
                $is_pattern = 0;
            }

# patch for RT#131288, user constant function without prototype
            # last type is 'U' followed by ?.
            elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
                $is_pattern = 0;
            }
            elsif ( $expecting == UNKNOWN ) {

# In older versions of Perl, a bare ? can be a pattern
                # delimiter.  In perl version 5.22 this was
                # dropped, but we have to support it in order to format
                # older programs. See:
                ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
                # For example, the following line worked
                # at one time:
                #      ?(.*)? && (print $1,"\n");
                # In current versions it would have to be written with slashes:
                #      /(.*)/ && (print $1,"\n");
                my $msg;
                ( $is_pattern, $msg ) =
                  guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
                    $max_token_index );

if ($msg) { write_logfile_entry($msg) }
            }
            else { $is_pattern = ( $expecting == TERM ) }

if ($is_pattern) {
                $in_quote                = 1;
                $type                    = 'Q';
                $allowed_quote_modifiers = '[msixpodualngc]';
            }
            else {
                ( $type_sequence, $indent_flag ) =
                  increase_nesting_depth( QUESTION_COLON,
                    $rtoken_map->[$i_tok] );
            }
        },
        '*' => sub {    # typeglob, or multiply?

if ( $expecting == TERM ) {
                scan_identifier_fast();
            }
            else {

if ( $rtokens->[ $i + 1 ] eq '=' ) {
                    $tok  = '*=';
                    $type = $tok;
                    $i++;
                }
                elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
                    $tok  = '**';
                    $type = $tok;
                    $i++;
                    if ( $rtokens->[ $i + 1 ] eq '=' ) {
                        $tok  = '**=';
                        $type = $tok;
                        $i++;
                    }
                }
            }
        },
        '.' => sub {    # what kind of . ?

if ( $expecting != OPERATOR ) {
                scan_number();
                if ( $type eq '.' ) {
                    error_if_expecting_TERM()
                      if ( $expecting == TERM );
                }
            }
            else {
            }
        },
        ':' => sub {

# if this is the first nonblank character, call it a label
            # since perl seems to just swallow it
            if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
                $type = 'J';
            }

# ATTRS: check for a ':' which introduces an attribute list
            # either after a 'sub' keyword or within a paren list
            elsif ( $statement_type =~ /^sub\b/ ) {
                $type              = 'A';
                $in_attribute_list = 1;
            }

# Withing a signature, unless we are in a ternary.  For example,
            # from 't/filter_example.t':
            #    method foo4 ( $class: $bar ) { $class->bar($bar) }
            elsif ( $paren_type[$paren_depth] =~ /^sub\b/
                && !is_balanced_closing_container(QUESTION_COLON) )
            {
                $type              = 'A';
                $in_attribute_list = 1;
            }

# check for scalar attribute, such as
            # my $foo : shared = 1;
            elsif ($is_my_our_state{$statement_type}
                && $current_depth[QUESTION_COLON] == 0 )
            {
                $type              = 'A';
                $in_attribute_list = 1;
            }

# Look for Switch::Plain syntax if an error would otherwise occur
            # here. Note that we do not need to check if the extended syntax
            # flag is set because otherwise an error would occur, and we would
            # then have to output a message telling the user to set the
            # extended syntax flag to avoid the error.
            #  case 1: {
            #  default: {
            #  default:
            # Note that the line 'default:' will be parsed as a label elsewhere.
            elsif ( $is_case_default{$statement_type}
                && !is_balanced_closing_container(QUESTION_COLON) )
            {
                # mark it as a perltidy label type
                $type = 'J';
            }

# otherwise, it should be part of a ?/: operator
            else {
                ( $type_sequence, $indent_flag ) =
                  decrease_nesting_depth( QUESTION_COLON,
                    $rtoken_map->[$i_tok] );
                if ( $last_nonblank_token eq '?' ) {
                    warning("Syntax error near ? :\n");
                }
            }
        },
        '+' => sub {    # what kind of plus?

if ( $expecting == TERM ) {
                my $number = scan_number_fast();

# unary plus is safest assumption if not a number
                if ( !defined($number) ) { $type = 'p'; }
            }
            elsif ( $expecting == OPERATOR ) {
            }
            else {
                if ( $next_type eq 'w' ) { $type = 'p' }
            }
        },
        '@' => sub {

error_if_expecting_OPERATOR("Array")
              if ( $expecting == OPERATOR );
            scan_identifier_fast();
        },
        '%' => sub {    # hash or modulo?

# first guess is hash if no following blank
            if ( $expecting == UNKNOWN ) {
                if ( $next_type ne 'b' ) { $expecting = TERM }
            }
            if ( $expecting == TERM ) {
                scan_identifier_fast();
            }
        },
        '[' => sub {
            $square_bracket_type[ ++$square_bracket_depth ] =
              $last_nonblank_token;
            ( $type_sequence, $indent_flag ) =
              increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );

# It may seem odd, but structural square brackets have
            # type '{' and '}'.  This simplifies the indentation logic.
            if ( !is_non_structural_brace() ) {
                $type = '{';
            }
            $square_bracket_structural_type[$square_bracket_depth] = $type;
        },
        ']' => sub {
            ( $type_sequence, $indent_flag ) =
              decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );

if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
            {
                $type = '}';
            }

# propagate type information for smartmatch operator.  This is
            # necessary to enable us to know if an operator or term is expected
            # next.
            if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
                $tok = $square_bracket_type[$square_bracket_depth];
            }

if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
        },
        '-' => sub {    # what kind of minus?

if ( ( $expecting != OPERATOR )
                && $is_file_test_operator{$next_tok} )
            {
                my ( $next_nonblank_token, $i_next ) =
                  find_next_nonblank_token( $i + 1, $rtokens,
                    $max_token_index );

# check for a quoted word like "-w=>xx";
                # it is sufficient to just check for a following '='
                if ( $next_nonblank_token eq '=' ) {
                    $type = 'm';
                }
                else {
                    $i++;
                    $tok .= $next_tok;
                    $type = 'F';
                }
            }
            elsif ( $expecting == TERM ) {
                my $number = scan_number_fast();

# maybe part of bareword token? unary is safest
                if ( !defined($number) ) { $type = 'm'; }

}
            elsif ( $expecting == OPERATOR ) {
            }
            else {

if ( $next_type eq 'w' ) {
                    $type = 'm';
                }
            }
        },

'^' => sub {

# check for special variables like ${^WARNING_BITS}
            if ( $expecting == TERM ) {

# FIXME: this should work but will not catch errors
                # because we also have to be sure that previous token is
                # a type character ($,@,%).
                if (   $last_nonblank_token eq '{'
                    && ( $next_tok !~ /^\d/ )
                    && ( $next_tok =~ /^\w/ ) )
                {

if ( $next_tok eq 'W' ) {
                        $tokenizer_self->[_saw_perl_dash_w_] = 1;
                    }
                    $tok  = $tok . $next_tok;
                    $i    = $i + 1;
                    $type = 'w';
                }

else {
                    unless ( error_if_expecting_TERM() ) {

# Something like this is valid but strange:
                        # undef ^I;
                        complain("The '^' seems unusual here\n");
                    }
                }
            }
        },

'::' => sub {    # probably a sub call
            scan_bare_identifier();
        },
        '<<' => sub {    # maybe a here-doc?

##      This check removed because it could be a deprecated here-doc with
##      no specified target.  See example in log 16 Sep 2020.
##            return
##              unless ( $i < $max_token_index )
##              ;          # here-doc not possible if end of line

if ( $expecting != OPERATOR ) {
                my ( $found_target, $here_doc_target, $here_quote_character,
                    $saw_error );
                (
                    $found_target, $here_doc_target, $here_quote_character, $i,
                    $saw_error
                  )
                  = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
                    $max_token_index );

if ($found_target) {
                    push @{$rhere_target_list},
                      [ $here_doc_target, $here_quote_character ];
                    $type = 'h';
                    if ( length($here_doc_target) > 80 ) {
                        my $truncated = substr( $here_doc_target, 0, 80 );
                        complain("Long here-target: '$truncated' ...\n");
                    }
                    elsif ( !$here_doc_target ) {
                        warning(
                            'Use of bare << to mean <<"" is deprecated' . "\n" )
                          unless ($here_quote_character);
                    }
                    elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
                        complain(
                            "Unconventional here-target: '$here_doc_target'\n");
                    }
                }
                elsif ( $expecting == TERM ) {
                    unless ($saw_error) {

# shouldn't happen..
                        warning("Program bug; didn't find here doc target\n");
                        report_definite_bug();
                    }
                }
            }
            else {
            }
        },
        '<<~' => sub {    # a here-doc, new type added in v26
            return
              unless ( $i < $max_token_index )
              ;           # here-doc not possible if end of line
            if ( $expecting != OPERATOR ) {
                my ( $found_target, $here_doc_target, $here_quote_character,
                    $saw_error );
                (
                    $found_target, $here_doc_target, $here_quote_character, $i,
                    $saw_error
                  )
                  = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
                    $max_token_index );

if ($found_target) {

if ( length($here_doc_target) > 80 ) {
                        my $truncated = substr( $here_doc_target, 0, 80 );
                        complain("Long here-target: '$truncated' ...\n");
                    }
                    elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
                        complain(
                            "Unconventional here-target: '$here_doc_target'\n");
                    }

# Note that we put a leading space on the here quote
                    # character indicate that it may be preceded by spaces
                    $here_quote_character = " " . $here_quote_character;
                    push @{$rhere_target_list},
                      [ $here_doc_target, $here_quote_character ];
                    $type = 'h';
                }
                elsif ( $expecting == TERM ) {
                    unless ($saw_error) {

# shouldn't happen..
                        warning("Program bug; didn't find here doc target\n");
                        report_definite_bug();
                    }
                }
            }
            else {
            }
        },
        '->' => sub {

# if -> points to a bare word, we must scan for an identifier,
            # otherwise something like ->y would look like the y operator
            scan_identifier_fast();
        },

# type = 'pp' for pre-increment, '++' for post-increment
        '++' => sub {
            if    ( $expecting == TERM ) { $type = 'pp' }
            elsif ( $expecting == UNKNOWN ) {
                my ( $next_nonblank_token, $i_next ) =
                  find_next_nonblank_token( $i, $rtokens, $max_token_index );
                if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
            }
        },

'=>' => sub {
            if ( $last_nonblank_type eq $tok ) {
                complain("Repeated '=>'s \n");
            }

# patch for operator_expected: note if we are in the list (use.t)
            # TODO: make version numbers a new token type
            if ( $statement_type eq 'use' ) { $statement_type = '_use' }
        },

# type = 'mm' for pre-decrement, '--' for post-decrement
        '--' => sub {

if    ( $expecting == TERM ) { $type = 'mm' }
            elsif ( $expecting == UNKNOWN ) {
                my ( $next_nonblank_token, $i_next ) =
                  find_next_nonblank_token( $i, $rtokens, $max_token_index );
                if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
            }
        },

'&&' => sub {
            error_if_expecting_TERM()
              if ( $expecting == TERM );
        },

'||' => sub {
            error_if_expecting_TERM()
              if ( $expecting == TERM );
        },

'//' => sub {
            error_if_expecting_TERM()
              if ( $expecting == TERM );
        },
    };

# ------------------------------------------------------------
    # end hash of code for handling individual token types
    # ------------------------------------------------------------

my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );

# These block types terminate statements and do not need a trailing
    # semicolon
    # patched for SWITCH/CASE/
    my %is_zero_continuation_block_type;
    @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
      if elsif else unless while until for foreach switch case given when);
    @is_zero_continuation_block_type{@_} = (1) x scalar(@_);

my %is_not_zero_continuation_block_type;
    @_ = qw(sort grep map do eval);
    @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);

my %is_logical_container;
    @_ = qw(if elsif unless while and or err not && !  || for foreach);
    @is_logical_container{@_} = (1) x scalar(@_);

my %is_binary_type;
    @_ = qw(|| &&);
    @is_binary_type{@_} = (1) x scalar(@_);

my %is_binary_keyword;
    @_ = qw(and or err eq ne cmp);
    @is_binary_keyword{@_} = (1) x scalar(@_);

# 'L' is token for opening { at hash key
    my %is_opening_type;
    @_ = qw< L { ( [ >;
    @is_opening_type{@_} = (1) x scalar(@_);

# 'R' is token for closing } at hash key
    my %is_closing_type;
    @_ = qw< R } ) ] >;
    @is_closing_type{@_} = (1) x scalar(@_);

my %is_redo_last_next_goto;
    @_ = qw(redo last next goto);
    @is_redo_last_next_goto{@_} = (1) x scalar(@_);

my %is_use_require;
    @_ = qw(use require);
    @is_use_require{@_} = (1) x scalar(@_);

# This hash holds the array index in $tokenizer_self for these keywords:
    my %is_format_END_DATA = (
        'format'   => _in_format_,
        '__END__'  => _in_end_,
        '__DATA__' => _in_data_,
    );

# original ref: camel 3 p 147,
    # but perl may accept undocumented flags
    # perl 5.10 adds 'p' (preserve)
    # Perl version 5.22 added 'n'
    # From http://perldoc.perl.org/perlop.html we have
    # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
    # s/PATTERN/REPLACEMENT/msixpodualngcer
    # y/SEARCHLIST/REPLACEMENTLIST/cdsr
    # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
    # qr/STRING/msixpodualn
    my %quote_modifiers = (
        's'  => '[msixpodualngcer]',
        'y'  => '[cdsr]',
        'tr' => '[cdsr]',
        'm'  => '[msixpodualngc]',
        'qr' => '[msixpodualn]',
        'q'  => "",
        'qq' => "",
        'qw' => "",
        'qx' => "",
    );

# table showing how many quoted things to look for after quote operator..
    # s, y, tr have 2 (pattern and replacement)
    # others have 1 (pattern only)
    my %quote_items = (
        's'  => 2,
        'y'  => 2,
        'tr' => 2,
        'm'  => 1,
        'qr' => 1,
        'q'  => 1,
        'qq' => 1,
        'qw' => 1,
        'qx' => 1,
    );

use constant DEBUG_TOKENIZE => 0;

sub tokenize_this_line {

# This routine breaks a line of perl code into tokens which are of use in
  # indentation and reformatting.  One of my goals has been to define tokens
  # such that a newline may be inserted between any pair of tokens without
  # changing or invalidating the program. This version comes close to this,
  # although there are necessarily a few exceptions which must be caught by
  # the formatter.  Many of these involve the treatment of bare words.
  #
  # The tokens and their types are returned in arrays.  See previous
  # routine for their names.
  #
  # See also the array "valid_token_types" in the BEGIN section for an
  # up-to-date list.
  #
  # To simplify things, token types are either a single character, or they
  # are identical to the tokens themselves.
  #
  # As a debugging aid, the -D flag creates a file containing a side-by-side
  # comparison of the input string and its tokenization for each line of a file.
  # This is an invaluable debugging aid.
  #
  # In addition to tokens, and some associated quantities, the tokenizer
  # also returns flags indication any special line types.  These include
  # quotes, here_docs, formats.
  #
  # -----------------------------------------------------------------------
  #
  # How to add NEW_TOKENS:
  #
  # New token types will undoubtedly be needed in the future both to keep up
  # with changes in perl and to help adapt the tokenizer to other applications.
  #
  # Here are some notes on the minimal steps.  I wrote these notes while
  # adding the 'v' token type for v-strings, which are things like version
  # numbers 5.6.0, and ip addresses, and will use that as an example.  ( You
  # can use your editor to search for the string "NEW_TOKENS" to find the
  # appropriate sections to change):
  #
  # *. Try to talk somebody else into doing it!  If not, ..
  #
  # *. Make a backup of your current version in case things don't work out!
  #
  # *. Think of a new, unused character for the token type, and add to
  # the array @valid_token_types in the BEGIN section of this package.
  # For example, I used 'v' for v-strings.
  #
  # *. Implement coding to recognize the $type of the token in this routine.
  # This is the hardest part, and is best done by imitating or modifying
  # some of the existing coding.  For example, to recognize v-strings, I
  # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
  # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
  #
  # *. Update sub operator_expected.  This update is critically important but
  # the coding is trivial.  Look at the comments in that routine for help.
  # For v-strings, which should behave like numbers, I just added 'v' to the
  # regex used to handle numbers and strings (types 'n' and 'Q').
  #
  # *. Implement a 'bond strength' rule in sub set_bond_strengths in
  # Perl::Tidy::Formatter for breaking lines around this token type.  You can
  # skip this step and take the default at first, then adjust later to get
  # desired results.  For adding type 'v', I looked at sub bond_strength and
  # saw that number type 'n' was using default strengths, so I didn't do
  # anything.  I may tune it up someday if I don't like the way line
  # breaks with v-strings look.
  #
  # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
  # Perl::Tidy::Formatter.  For adding type 'v', I looked at this routine
  # and saw that type 'n' used spaces on both sides, so I just added 'v'
  # to the array @spaces_both_sides.
  #
  # *. Update HtmlWriter package so that users can colorize the token as
  # desired.  This is quite easy; see comments identified by 'NEW_TOKENS' in
  # that package.  For v-strings, I initially chose to use a default color
  # equal to the default for numbers, but it might be nice to change that
  # eventually.
  #
  # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
  #
  # *. Run lots and lots of debug tests.  Start with special files designed
  # to test the new token type.  Run with the -D flag to create a .DEBUG
  # file which shows the tokenization.  When these work ok, test as many old
  # scripts as possible.  Start with all of the '.t' files in the 'test'
  # directory of the distribution file.  Compare .tdy output with previous
  # version and updated version to see the differences.  Then include as
  # many more files as possible. My own technique has been to collect a huge
  # number of perl scripts (thousands!) into one directory and run perltidy
  # *, then run diff between the output of the previous version and the
  # current version.
  #
  # *. For another example, search for the smartmatch operator '~~'
  # with your editor to see where updates were made for it.
  #
  # -----------------------------------------------------------------------

my $line_of_tokens = shift;
        my ($untrimmed_input_line) = $line_of_tokens->{_line_text};

# patch while coding change is underway
        # make callers private data to allow access
        # $tokenizer_self = $caller_tokenizer_self;

# extract line number for use in error messages
        $input_line_number = $line_of_tokens->{_line_number};

# reinitialize for multi-line quote
        $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';

# check for pod documentation
        if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
            && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
        {

# must not be in multi-line quote
            # and must not be in an equation
            if ( !$in_quote
                && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
            {
                $tokenizer_self->[_in_pod_] = 1;
                return;
            }
        }

$input_line = $untrimmed_input_line;

chomp $input_line;

# Set a flag to indicate if we might be at an __END__ or __DATA__ line
        # This will be used below to avoid quoting a bare word followed by
        # a fat comma.
        my $is_END_or_DATA;

# trim start of this line unless we are continuing a quoted line
        # do not trim end because we might end in a quote (test: deken4.pl)
        # Perl::Tidy::Formatter will delete needless trailing blanks
        unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
            $input_line =~ s/^\s+//;    # trim left end

$is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
              && $input_line =~ /^\s*__(END|DATA)__\s*$/;
        }

# update the copy of the line for use in error messages
        # This must be exactly what we give the pre_tokenizer
        $tokenizer_self->[_line_of_text_] = $input_line;

# re-initialize for the main loop
        $routput_token_list     = [];    # stack of output token indexes
        $routput_token_type     = [];    # token types
        $routput_block_type     = [];    # types of code block
        $routput_container_type = [];    # paren types, such as if, elsif, ..
        $routput_type_sequence  = [];    # nesting sequential number

$rhere_target_list = [];

$tok             = $last_nonblank_token;
        $type            = $last_nonblank_type;
        $prototype       = $last_nonblank_prototype;
        $last_nonblank_i = -1;
        $block_type      = $last_nonblank_block_type;
        $container_type  = $last_nonblank_container_type;
        $type_sequence   = $last_nonblank_type_sequence;
        $indent_flag     = 0;
        $peeked_ahead    = 0;

# tokenization is done in two stages..
        # stage 1 is a very simple pre-tokenization
        my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens

# a little optimization for a full-line comment
        if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
            $max_tokens_wanted = 1    # no use tokenizing a comment
        }

# start by breaking the line into pre-tokens
        ( $rtokens, $rtoken_map, $rtoken_type ) =
          pre_tokenize( $input_line, $max_tokens_wanted );

$max_token_index = scalar( @{$rtokens} ) - 1;
        push( @{$rtokens}, ' ', ' ', ' ' );  # extra whitespace simplifies logic
        push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
        push( @{$rtoken_type}, 'b', 'b', 'b' );

# initialize for main loop
        foreach my $ii ( 0 .. $max_token_index + 3 ) {
            $routput_token_type->[$ii]     = "";
            $routput_block_type->[$ii]     = "";
            $routput_container_type->[$ii] = "";
            $routput_type_sequence->[$ii]  = "";
            $routput_indent_flag->[$ii]    = 0;
        }
        $i     = -1;
        $i_tok = -1;

# ------------------------------------------------------------
        # begin main tokenization loop
        # ------------------------------------------------------------

# we are looking at each pre-token of one line and combining them
        # into tokens
        while ( ++$i <= $max_token_index ) {

if ($in_quote) {    # continue looking for end of a quote
                $type = $quote_type;

unless ( @{$routput_token_list} )
                {               # initialize if continuation line
                    push( @{$routput_token_list}, $i );
                    $routput_token_type->[$i] = $type;

}
                $tok = $quote_character if ($quote_character);

# scan for the end of the quote or pattern
                (
                    $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
                    $quoted_string_1, $quoted_string_2
                  )
                  = do_quote(
                    $i,               $in_quote,    $quote_character,
                    $quote_pos,       $quote_depth, $quoted_string_1,
                    $quoted_string_2, $rtokens,     $rtoken_map,
                    $max_token_index
                  );

# all done if we didn't find it
                last if ($in_quote);

# save pattern and replacement text for rescanning
                my $qs1 = $quoted_string_1;
                my $qs2 = $quoted_string_2;

# re-initialize for next search
                $quote_character = '';
                $quote_pos       = 0;
                $quote_type      = 'Q';
                $quoted_string_1 = "";
                $quoted_string_2 = "";
                last if ( ++$i > $max_token_index );

# look for any modifiers
                if ($allowed_quote_modifiers) {

# check for exact quote modifiers
                    if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
                        my $str = $rtokens->[$i];
                        my $saw_modifier_e;
                        while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
                            my $pos  = pos($str);
                            my $char = substr( $str, $pos - 1, 1 );
                            $saw_modifier_e ||= ( $char eq 'e' );
                        }

# For an 'e' quote modifier we must scan the replacement
                        # text for here-doc targets...
                        # but if the modifier starts a new line we can skip
                        # this because either the here doc will be fully
                        # contained in the replacement text (so we can
                        # ignore it) or Perl will not find it.
                        # See test 'here2.in'.
                        if ( $saw_modifier_e && $i_tok >= 0 ) {

my $rht = scan_replacement_text($qs1);

# Change type from 'Q' to 'h' for quotes with
                            # here-doc targets so that the formatter (see sub
                            # process_line_of_CODE) will not make any line
                            # breaks after this point.
                            if ($rht) {
                                push @{$rhere_target_list}, @{$rht};
                                $type = 'h';
                                if ( $i_tok < 0 ) {
                                    my $ilast = $routput_token_list->[-1];
                                    $routput_token_type->[$ilast] = $type;
                                }
                            }
                        }

if ( defined( pos($str) ) ) {

# matched
                            if ( pos($str) == length($str) ) {
                                last if ( ++$i > $max_token_index );
                            }

# Looks like a joined quote modifier
                            # and keyword, maybe something like
                            # s/xxx/yyy/gefor @k=...
                            # Example is "galgen.pl".  Would have to split
                            # the word and insert a new token in the
                            # pre-token list.  This is so rare that I haven't
                            # done it.  Will just issue a warning citation.

# This error might also be triggered if my quote
                            # modifier characters are incomplete
                            else {
                                warning(<<EOM);

Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
Please put a space between quote modifiers and trailing keywords.
EOM

# print "token $rtokens->[$i]\n";
                         # my $num = length($str) - pos($str);
                         # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
                         # print "continuing with new token $rtokens->[$i]\n";

# skipping past this token does least damage
                                last if ( ++$i > $max_token_index );
                            }
                        }
                        else {

# example file: rokicki4.pl
                            # This error might also be triggered if my quote
                            # modifier characters are incomplete
                            write_logfile_entry(
"Note: found word $str at quote modifier location\n"
                            );
                        }
                    }

# re-initialize
                    $allowed_quote_modifiers = "";
                }
            }

unless ( $type eq 'b' || $tok eq 'CORE::' ) {

# try to catch some common errors
                if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {

if ( $last_nonblank_token eq 'eq' ) {
                        complain("Should 'eq' be '==' here ?\n");
                    }
                    elsif ( $last_nonblank_token eq 'ne' ) {
                        complain("Should 'ne' be '!=' here ?\n");
                    }
                }

$last_last_nonblank_token      = $last_nonblank_token;
                $last_last_nonblank_type       = $last_nonblank_type;
                $last_last_nonblank_block_type = $last_nonblank_block_type;
                $last_last_nonblank_container_type =
                  $last_nonblank_container_type;
                $last_last_nonblank_type_sequence =
                  $last_nonblank_type_sequence;
                $last_nonblank_token          = $tok;
                $last_nonblank_type           = $type;
                $last_nonblank_prototype      = $prototype;
                $last_nonblank_block_type     = $block_type;
                $last_nonblank_container_type = $container_type;
                $last_nonblank_type_sequence  = $type_sequence;
                $last_nonblank_i              = $i_tok;
            }

# store previous token type
            if ( $i_tok >= 0 ) {
                $routput_token_type->[$i_tok]     = $type;
                $routput_block_type->[$i_tok]     = $block_type;
                $routput_container_type->[$i_tok] = $container_type;
                $routput_type_sequence->[$i_tok]  = $type_sequence;
                $routput_indent_flag->[$i_tok]    = $indent_flag;
            }
            my $pre_tok  = $rtokens->[$i];        # get the next pre-token
            my $pre_type = $rtoken_type->[$i];    # and type
            $tok        = $pre_tok;
            $type       = $pre_type;              # to be modified as necessary
            $block_type = "";    # blank for all tokens except code block braces
            $container_type = "";    # blank for all tokens except some parens
            $type_sequence  = "";    # blank for all tokens except ?/:
            $indent_flag    = 0;
            $prototype = "";    # blank for all tokens except user defined subs
            $i_tok     = $i;

# this pre-token will start an output token
            push( @{$routput_token_list}, $i_tok );

# continue gathering identifier if necessary
            # but do not start on blanks and comments
            if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) {

if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
                    scan_id();
                }
                else {
                    scan_identifier();
                }

last if ($id_scan_state);
                next if ( ( $i > 0 ) || $type );

# didn't find any token; start over
                $type = $pre_type;
                $tok  = $pre_tok;
            }

# handle whitespace tokens..
            next if ( $type eq 'b' );
            my $prev_tok  = $i > 0 ? $rtokens->[ $i - 1 ]     : ' ';
            my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';

# Build larger tokens where possible, since we are not in a quote.
            #
            # First try to assemble digraphs.  The following tokens are
            # excluded and handled specially:
            # '/=' is excluded because the / might start a pattern.
            # 'x=' is excluded since it might be $x=, with $ on previous line
            # '**' and *= might be typeglobs of punctuation variables
            # I have allowed tokens starting with <, such as <=,
            # because I don't think these could be valid angle operators.
            # test file: storrs4.pl
            my $test_tok   = $tok . $rtokens->[ $i + 1 ];
            my $combine_ok = $is_digraph{$test_tok};

# check for special cases which cannot be combined
            if ($combine_ok) {

# '//' must be defined_or operator if an operator is expected.
                # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
                # could be migrated here for clarity

# Patch for RT#102371, misparsing a // in the following snippet:
              #     state $b //= ccc();
              # The solution is to always accept the digraph (or trigraph) after
              # token type 'Z' (possible file handle).  The reason is that
              # sub operator_expected gives TERM expected here, which is
              # wrong in this case.
                if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
                    my $next_type = $rtokens->[ $i + 1 ];
                    my $expecting =
                      operator_expected( [ $prev_type, $tok, $next_type ] );

# Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
                    $combine_ok = 0 if ( $expecting == TERM );
                }

# Patch for RT #114359: Missparsing of "print $x ** 0.5;
                # Accept the digraphs '**' only after type 'Z'
                # Otherwise postpone the decision.
                if ( $test_tok eq '**' ) {
                    if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
                }
            }

if (
                $combine_ok

&& ( $test_tok ne '/=' )    # might be pattern
                && ( $test_tok ne 'x=' )    # might be $x
                && ( $test_tok ne '*=' )    # typeglob?

# Moved above as part of fix for
                # RT #114359: Missparsing of "print $x ** 0.5;
                # && ( $test_tok ne '**' )    # typeglob?
              )
            {
                $tok = $test_tok;
                $i++;

# Now try to assemble trigraphs.  Note that all possible
                # perl trigraphs can be constructed by appending a character
                # to a digraph.
                $test_tok = $tok . $rtokens->[ $i + 1 ];

if ( $is_trigraph{$test_tok} ) {
                    $tok = $test_tok;
                    $i++;
                }

# The only current tetragraph is the double diamond operator
                # and its first three characters are not a trigraph, so
                # we do can do a special test for it
                elsif ( $test_tok eq '<<>' ) {
                    $test_tok .= $rtokens->[ $i + 2 ];
                    if ( $is_tetragraph{$test_tok} ) {
                        $tok = $test_tok;
                        $i += 2;
                    }
                }
            }

$type      = $tok;
            $next_tok  = $rtokens->[ $i + 1 ];
            $next_type = $rtoken_type->[ $i + 1 ];

DEBUG_TOKENIZE && do {
                local $" = ')(';
                my @debug_list = (
                    $last_nonblank_token,      $tok,
                    $next_tok,                 $brace_depth,
                    $brace_type[$brace_depth], $paren_depth,
                    $paren_type[$paren_depth]
                );
                print STDOUT "TOKENIZE:(@debug_list)\n";
            };

# turn off attribute list on first non-blank, non-bareword
            if ( $pre_type ne 'w' ) { $in_attribute_list = 0 }

###############################################################
            # We have the next token, $tok.
            # Now we have to examine this token and decide what it is
            # and define its $type
            #
            # section 1: bare words
            ###############################################################

if ( $pre_type eq 'w' ) {
                $expecting =
                  operator_expected( [ $prev_type, $tok, $next_type ] );
                my ( $next_nonblank_token, $i_next ) =
                  find_next_nonblank_token( $i, $rtokens, $max_token_index );

# ATTRS: handle sub and variable attributes
                if ($in_attribute_list) {

# treat bare word followed by open paren like qw(
                    if ( $next_nonblank_token eq '(' ) {

# For something like:
                        #     : prototype($$)
                        # we should let do_scan_sub see it so that it can see
                        # the prototype.  All other attributes get parsed as a
                        # quoted string.
                        if ( $tok eq 'prototype' ) {
                            $id_scan_state = 'prototype';

# start just after the word 'prototype'
                            my $i_beg = $i + 1;
                            ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
                                {
                                    input_line      => $input_line,
                                    i               => $i,
                                    i_beg           => $i_beg,
                                    tok             => $tok,
                                    type            => $type,
                                    rtokens         => $rtokens,
                                    rtoken_map      => $rtoken_map,
                                    id_scan_state   => $id_scan_state,
                                    max_token_index => $max_token_index
                                }
                            );

# If successful, mark as type 'q' to be consistent with other
                   # attributes.  Note that type 'w' would also work.
                            if ( $i > $i_beg ) {
                                $type = 'q';
                                next;
                            }

# If not successful, continue and parse as a quote.
                        }

# All other attribute lists must be parsed as quotes
                        # (see 'signatures.t' for good examples)
                        $in_quote                = $quote_items{'q'};
                        $allowed_quote_modifiers = $quote_modifiers{'q'};
                        $type                    = 'q';
                        $quote_type              = 'q';
                        next;
                    }

# handle bareword not followed by open paren
                    else {
                        $type = 'w';
                        next;
                    }
                }

# quote a word followed by => operator
                # unless the word __END__ or __DATA__ and the only word on
                # the line.
                if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {

if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
                        if ( $is_constant{$current_package}{$tok} ) {
                            $type = 'C';
                        }
                        elsif ( $is_user_function{$current_package}{$tok} ) {
                            $type = 'U';
                            $prototype =
                              $user_function_prototype{$current_package}{$tok};
                        }
                        elsif ( $tok =~ /^v\d+$/ ) {
                            $type = 'v';
                            report_v_string($tok);
                        }
                        else {

# Bareword followed by a fat comma ... see 'git18.in'
                           # If tok is something like 'x17' then it could
                           # actually be operator x followed by number 17.
                           # For example, here:
                           #     123x17 => [ 792, 1224 ],
                           # (a key of 123 repeated 17 times, perhaps not
                           # what was intended). We will mark x17 as type
                           # 'n' and it will be split. If the previous token
                           # was also a bareword then it is not very clear is
                           # going on.  In this case we will not be sure that
                           # an operator is expected, so we just mark it as a
                           # bareword.  Perl is a little murky in what it does
                           # with stuff like this, and its behavior can change
                           # over time.  Something like
                           #    a x18 => [792, 1224], will compile as
                           # a key with 18 a's.  But something like
                           #    push @array, a x18;
                           # is a syntax error.
                            if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) {
                                $type = 'n';
                            }
                            else {

# git #18
                                $type = 'w';
                                error_if_expecting_OPERATOR();
                            }
                        }

next;
                    }
                }

# quote a bare word within braces..like xxx->{s}; note that we
     # must be sure this is not a structural brace, to avoid
     # mistaking {s} in the following for a quoted bare word:
     #     for(@[){s}bla}BLA}
     # Also treat q in something like var{-q} as a bare word, not qoute operator
                if (
                    $next_nonblank_token eq '}'
                    && (
                        $last_nonblank_type eq 'L'
                        || (   $last_nonblank_type eq 'm'
                            && $last_last_nonblank_type eq 'L' )
                    )
                  )
                {
                    $type = 'w';
                    next;
                }

# a bare word immediately followed by :: is not a keyword;
                # use $tok_kw when testing for keywords to avoid a mistake
                my $tok_kw = $tok;
                if (   $rtokens->[ $i + 1 ] eq ':'
                    && $rtokens->[ $i + 2 ] eq ':' )
                {
                    $tok_kw .= '::';
                }

# handle operator x (now we know it isn't $x=)
                if (   $expecting == OPERATOR
                    && substr( $tok, 0, 1 ) eq 'x'
                    && $tok =~ /^x\d*$/ )
                {
                    if ( $tok eq 'x' ) {

if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
                            $tok  = 'x=';
                            $type = $tok;
                            $i++;
                        }
                        else {
                            $type = 'x';
                        }
                    }

# FIXME: Patch: mark something like x4 as an integer for now
                    # It gets fixed downstream.  This is easier than
                    # splitting the pretoken.
                    else {
                        $type = 'n';
                    }
                }
                elsif ( $tok_kw eq 'CORE::' ) {
                    $type = $tok = $tok_kw;
                    $i += 2;
                }
                elsif ( ( $tok eq 'strict' )
                    and ( $last_nonblank_token eq 'use' ) )
                {
                    $tokenizer_self->[_saw_use_strict_] = 1;
                    scan_bare_identifier();
                }

elsif ( ( $tok eq 'warnings' )
                    and ( $last_nonblank_token eq 'use' ) )
                {
                    $tokenizer_self->[_saw_perl_dash_w_] = 1;

# scan as identifier, so that we pick up something like:
                    # use warnings::register
                    scan_bare_identifier();
                }

elsif (
                       $tok eq 'AutoLoader'
                    && $tokenizer_self->[_look_for_autoloader_]
                    && (
                        $last_nonblank_token eq 'use'

# these regexes are from AutoSplit.pm, which we want
                        # to mimic
                        || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
                        || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
                    )
                  )
                {
                    write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
                    $tokenizer_self->[_saw_autoloader_]      = 1;
                    $tokenizer_self->[_look_for_autoloader_] = 0;
                    scan_bare_identifier();
                }

elsif (
                       $tok eq 'SelfLoader'
                    && $tokenizer_self->[_look_for_selfloader_]
                    && (   $last_nonblank_token eq 'use'
                        || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
                        || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
                  )
                {
                    write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
                    $tokenizer_self->[_saw_selfloader_]      = 1;
                    $tokenizer_self->[_look_for_selfloader_] = 0;
                    scan_bare_identifier();
                }

elsif ( ( $tok eq 'constant' )
                    and ( $last_nonblank_token eq 'use' ) )
                {
                    scan_bare_identifier();
                    my ( $next_nonblank_token, $i_next ) =
                      find_next_nonblank_token( $i, $rtokens,
                        $max_token_index );

if ($next_nonblank_token) {

if ( $is_keyword{$next_nonblank_token} ) {

# Assume qw is used as a quote and okay, as in:
                            #  use constant qw{ DEBUG 0 };
                            # Not worth trying to parse for just a warning

# NOTE: This warning is deactivated because recent
                            # versions of perl do not complain here, but
                            # the coding is retained for reference.
                            if ( 0 && $next_nonblank_token ne 'qw' ) {
                                warning(
"Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
                                );
                            }
                        }

# FIXME: could check for error in which next token is
                        # not a word (number, punctuation, ..)
                        else {
                            $is_constant{$current_package}{$next_nonblank_token}
                              = 1;
                        }
                    }
                }

# various quote operators
                elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
##NICOL PATCH
                    if ( $expecting == OPERATOR ) {

# Be careful not to call an error for a qw quote
                        # where a parenthesized list is allowed.  For example,
                        # it could also be a for/foreach construct such as
                        #
                        #    foreach my $key qw\Uno Due Tres Quadro\ {
                        #        print "Set $key\n";
                        #    }
                        #

# Or it could be a function call.
                        # NOTE: Braces in something like &{ xxx } are not
                        # marked as a block, we might have a method call.
                        # &method(...), $method->(..), &{method}(...),
                        # $ref[2](list) is ok & short for $ref[2]->(list)
                        #
                        # See notes in 'sub code_block_type' and
                        # 'sub is_non_structural_brace'

unless (
                            $tok eq 'qw'
                            && (   $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
                                || $is_for_foreach{$want_paren} )
                          )
                        {
                            error_if_expecting_OPERATOR();
                        }
                    }
                    $in_quote                = $quote_items{$tok};
                    $allowed_quote_modifiers = $quote_modifiers{$tok};

# All quote types are 'Q' except possibly qw quotes.
                   # qw quotes are special in that they may generally be trimmed
                   # of leading and trailing whitespace.  So they are given a
                   # separate type, 'q', unless requested otherwise.
                    $type =
                      ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
                      ? 'q'
                      : 'Q';
                    $quote_type = $type;
                }

# check for a statement label
                elsif (
                       ( $next_nonblank_token eq ':' )
                    && ( $rtokens->[ $i_next + 1 ] ne ':' )
                    && ( $i_next <= $max_token_index )    # colon on same line
                    && label_ok()
                  )
                {
                    if ( $tok !~ /[A-Z]/ ) {
                        push @{ $tokenizer_self->[_rlower_case_labels_at_] },
                          $input_line_number;
                    }
                    $type = 'J';
                    $tok .= ':';
                    $i = $i_next;
                    next;
                }

#      'sub' or alias
                elsif ( $is_sub{$tok_kw} ) {
                    error_if_expecting_OPERATOR()
                      if ( $expecting == OPERATOR );
                    initialize_subname();
                    scan_id();
                }

#      'package'
                elsif ( $is_package{$tok_kw} ) {
                    error_if_expecting_OPERATOR()
                      if ( $expecting == OPERATOR );
                    scan_id();
                }

# Note on token types for format, __DATA__, __END__:
                # It simplifies things to give these type ';', so that when we
                # start rescanning we will be expecting a token of type TERM.
                # We will switch to type 'k' before outputting the tokens.
                elsif ( $is_format_END_DATA{$tok_kw} ) {
                    $type = ';';    # make tokenizer look for TERM next

# Remember that we are in one of these three sections
                    $tokenizer_self->[ $is_format_END_DATA{$tok_kw} ] = 1;
                    last;
                }

elsif ( $is_keyword{$tok_kw} ) {
                    $type = 'k';

# Since for and foreach may not be followed immediately
                    # by an opening paren, we have to remember which keyword
                    # is associated with the next '('
                    if ( $is_for_foreach{$tok} ) {
                        if ( new_statement_ok() ) {
                            $want_paren = $tok;
                        }
                    }

# recognize 'use' statements, which are special
                    elsif ( $is_use_require{$tok} ) {
                        $statement_type = $tok;
                        error_if_expecting_OPERATOR()
                          if ( $expecting == OPERATOR );
                    }

# remember my and our to check for trailing ": shared"
                    elsif ( $is_my_our_state{$tok} ) {
                        $statement_type = $tok;
                    }

# Check for misplaced 'elsif' and 'else', but allow isolated
                    # else or elsif blocks to be formatted.  This is indicated
                    # by a last noblank token of ';'
                    elsif ( $tok eq 'elsif' ) {
                        if (   $last_nonblank_token ne ';'
                            && $last_nonblank_block_type !~
                            /^(if|elsif|unless)$/ )
                        {
                            warning(
"expecting '$tok' to follow one of 'if|elsif|unless'\n"
                            );
                        }
                    }
                    elsif ( $tok eq 'else' ) {

# patched for SWITCH/CASE
                        if (
                               $last_nonblank_token ne ';'
                            && $last_nonblank_block_type !~
                            /^(if|elsif|unless|case|when)$/

# patch to avoid an unwanted error message for
                            # the case of a parenless 'case' (RT 105484):
                            # switch ( 1 ) { case x { 2 } else { } }
                            && $statement_type !~
                            /^(if|elsif|unless|case|when)$/
                          )
                        {
                            warning(
"expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
                            );
                        }
                    }
                    elsif ( $tok eq 'continue' ) {
                        if (   $last_nonblank_token ne ';'
                            && $last_nonblank_block_type !~
                            /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
                        {

# note: ';' '{' and '}' in list above
                            # because continues can follow bare blocks;
                            # ':' is labeled block
                            #
                            ############################################
                            # NOTE: This check has been deactivated because
                            # continue has an alternative usage for given/when
                            # blocks in perl 5.10
                            ## warning("'$tok' should follow a block\n");
                            ############################################
                        }
                    }

# patch for SWITCH/CASE if 'case' and 'when are
                    # treated as keywords.  Also 'default' for Switch::Plain
                    elsif ($tok eq 'when'
                        || $tok eq 'case'
                        || $tok eq 'default' )
                    {
                        $statement_type = $tok;    # next '{' is block
                    }

#
                    # indent trailing if/unless/while/until
                    # outdenting will be handled by later indentation loop
## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
##$opt_o = 1
##  if !(
##             $opt_b
##          || $opt_c
##          || $opt_d
##          || $opt_f
##          || $opt_i
##          || $opt_l
##          || $opt_o
##          || $opt_x
##  );
##                    if (   $tok =~ /^(if|unless|while|until)$/
##                        && $next_nonblank_token ne '(' )
##                    {
##                        $indent_flag = 1;
##                    }
                }

# check for inline label following
                #         /^(redo|last|next|goto)$/
                elsif (( $last_nonblank_type eq 'k' )
                    && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
                {
                    $type = 'j';
                    next;
                }

# something else --
                else {

scan_bare_identifier();

if (   $statement_type eq 'use'
                        && $last_nonblank_token eq 'use' )
                    {
                        $saw_use_module{$current_package}->{$tok} = 1;
                    }

if ( $type eq 'w' ) {

if ( $expecting == OPERATOR ) {

# Patch to avoid error message for RPerl overloaded
                            # operator functions: use overload
                            #    '+' => \&sse_add,
                            #    '-' => \&sse_sub,
                            #    '*' => \&sse_mul,
                            #    '/' => \&sse_div;
                            # FIXME: this should eventually be generalized
                            if (   $saw_use_module{$current_package}->{'RPerl'}
                                && $tok =~ /^sse_(mul|div|add|sub)$/ )
                            {

}

# don't complain about possible indirect object
                            # notation.
                            # For example:
                            #   package main;
                            #   sub new($) { ... }
                            #   $b = new A::;  # calls A::new
                            #   $c = new A;    # same thing but suspicious
                            # This will call A::new but we have a 'new' in
                            # main:: which looks like a constant.
                            #
                            elsif ( $last_nonblank_type eq 'C' ) {
                                if ( $tok !~ /::$/ ) {
                                    complain(<<EOM);
Expecting operator after '$last_nonblank_token' but found bare word '$tok'
       Maybe indirectet object notation?
EOM
                                }
                            }
                            else {
                                error_if_expecting_OPERATOR("bareword");
                            }
                        }

# mark bare words immediately followed by a paren as
                        # functions
                        $next_tok = $rtokens->[ $i + 1 ];
                        if ( $next_tok eq '(' ) {
                            $type = 'U';
                        }

# underscore after file test operator is file handle
                        if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
                            $type = 'Z';
                        }

# patch for SWITCH/CASE if 'case' and 'when are
                        # not treated as keywords:
                        if (
                            (
                                   $tok eq 'case'
                                && $brace_type[$brace_depth] eq 'switch'
                            )
                            || (   $tok eq 'when'
                                && $brace_type[$brace_depth] eq 'given' )
                          )
                        {
                            $statement_type = $tok;    # next '{' is block
                            $type           = 'k'; # for keyword syntax coloring
                        }

# patch for SWITCH/CASE if switch and given not keywords
                        # Switch is not a perl 5 keyword, but we will gamble
                        # and mark switch followed by paren as a keyword.  This
                        # is only necessary to get html syntax coloring nice,
                        # and does not commit this as being a switch/case.
                        if ( $next_nonblank_token eq '('
                            && ( $tok eq 'switch' || $tok eq 'given' ) )
                        {
                            $type = 'k';    # for keyword syntax coloring
                        }
                    }
                }
            }

###############################################################
            # section 2: strings of digits
            ###############################################################
            elsif ( $pre_type eq 'd' ) {
                $expecting =
                  operator_expected( [ $prev_type, $tok, $next_type ] );
                error_if_expecting_OPERATOR("Number")
                  if ( $expecting == OPERATOR );

my $number = scan_number_fast();
                if ( !defined($number) ) {

# shouldn't happen - we should always get a number
                    warning("non-number beginning with digit--program bug\n");
                    report_definite_bug();
                }
            }

###############################################################
            # section 3: all other tokens
            ###############################################################

else {
                last if ( $tok eq '#' );
                my $code = $tokenization_code->{$tok};
                if ($code) {
                    $expecting =
                      operator_expected( [ $prev_type, $tok, $next_type ] );
                    $code->();
                    redo if $in_quote;
                }
            }
        }

# -----------------------------
        # end of main tokenization loop
        # -----------------------------

if ( $i_tok >= 0 ) {
            $routput_token_type->[$i_tok]     = $type;
            $routput_block_type->[$i_tok]     = $block_type;
            $routput_container_type->[$i_tok] = $container_type;
            $routput_type_sequence->[$i_tok]  = $type_sequence;
            $routput_indent_flag->[$i_tok]    = $indent_flag;
        }

unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
            $last_last_nonblank_token          = $last_nonblank_token;
            $last_last_nonblank_type           = $last_nonblank_type;
            $last_last_nonblank_block_type     = $last_nonblank_block_type;
            $last_last_nonblank_container_type = $last_nonblank_container_type;
            $last_last_nonblank_type_sequence  = $last_nonblank_type_sequence;
            $last_nonblank_token               = $tok;
            $last_nonblank_type                = $type;
            $last_nonblank_block_type          = $block_type;
            $last_nonblank_container_type      = $container_type;
            $last_nonblank_type_sequence       = $type_sequence;
            $last_nonblank_prototype           = $prototype;
        }

# reset indentation level if necessary at a sub or package
        # in an attempt to recover from a nesting error
        if ( $level_in_tokenizer < 0 ) {
            if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
                reset_indentation_level(0);
                brace_warning("resetting level to 0 at $1 $2\n");
            }
        }

# all done tokenizing this line ...
        # now prepare the final list of tokens and types

my @token_type     = ();   # stack of output token types
        my @block_type     = ();   # stack of output code block types
        my @container_type = ();   # stack of output code container types
        my @type_sequence  = ();   # stack of output type sequence numbers
        my @tokens         = ();   # output tokens
        my @levels         = ();   # structural brace levels of output tokens
        my @slevels        = ();   # secondary nesting levels of output tokens
        my @nesting_tokens = ();   # string of tokens leading to this depth
        my @nesting_types  = ();   # string of token types leading to this depth
        my @nesting_blocks = ();   # string of block types leading to this depth
        my @nesting_lists  = ();   # string of list types leading to this depth
        my @ci_string = ();  # string needed to compute continuation indentation
        my @container_environment = ();    # BLOCK or LIST
        my $container_environment = '';
        my $im                    = -1;    # previous $i value
        my $num;

# Count the number of '1's in the string (previously sub ones_count)
        my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;

# Computing Token Indentation
#
#     The final section of the tokenizer forms tokens and also computes
#     parameters needed to find indentation.  It is much easier to do it
#     in the tokenizer than elsewhere.  Here is a brief description of how
#     indentation is computed.  Perl::Tidy computes indentation as the sum
#     of 2 terms:
#
#     (1) structural indentation, such as if/else/elsif blocks
#     (2) continuation indentation, such as long parameter call lists.
#
#     These are occasionally called primary and secondary indentation.
#
#     Structural indentation is introduced by tokens of type '{', although
#     the actual tokens might be '{', '(', or '['.  Structural indentation
#     is of two types: BLOCK and non-BLOCK.  Default structural indentation
#     is 4 characters if the standard indentation scheme is used.
#
#     Continuation indentation is introduced whenever a line at BLOCK level
#     is broken before its termination.  Default continuation indentation
#     is 2 characters in the standard indentation scheme.
#
#     Both types of indentation may be nested arbitrarily deep and
#     interlaced.  The distinction between the two is somewhat arbitrary.
#
#     For each token, we will define two variables which would apply if
#     the current statement were broken just before that token, so that
#     that token started a new line:
#
#     $level = the structural indentation level,
#     $ci_level = the continuation indentation level
#
#     The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
#     assuming defaults.  However, in some special cases it is customary
#     to modify $ci_level from this strict value.
#
#     The total structural indentation is easy to compute by adding and
#     subtracting 1 from a saved value as types '{' and '}' are seen.  The
#     running value of this variable is $level_in_tokenizer.
#
#     The total continuation is much more difficult to compute, and requires
#     several variables.  These variables are:
#
#     $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
#       each indentation level, if there are intervening open secondary
#       structures just prior to that level.
#     $continuation_string_in_tokenizer = a string of 1's and 0's indicating
#       if the last token at that level is "continued", meaning that it
#       is not the first token of an expression.
#     $nesting_block_string = a string of 1's and 0's indicating, for each
#       indentation level, if the level is of type BLOCK or not.
#     $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
#     $nesting_list_string = a string of 1's and 0's indicating, for each
#       indentation level, if it is appropriate for list formatting.
#       If so, continuation indentation is used to indent long list items.
#     $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
#     @{$rslevel_stack} = a stack of total nesting depths at each
#       structural indentation level, where "total nesting depth" means
#       the nesting depth that would occur if every nesting token -- '{', '[',
#       and '(' -- , regardless of context, is used to compute a nesting
#       depth.

#my $nesting_block_flag = ($nesting_block_string =~ /1$/);
        #my $nesting_list_flag = ($nesting_list_string =~ /1$/);

my ( $ci_string_i, $level_i, $nesting_block_string_i,
            $nesting_list_string_i, $nesting_token_string_i,
            $nesting_type_string_i, );

foreach my $i ( @{$routput_token_list} )
        {    # scan the list of pre-tokens indexes

# self-checking for valid token types
            my $type                    = $routput_token_type->[$i];
            my $forced_indentation_flag = $routput_indent_flag->[$i];

# See if we should undo the $forced_indentation_flag.
            # Forced indentation after 'if', 'unless', 'while' and 'until'
            # expressions without trailing parens is optional and doesn't
            # always look good.  It is usually okay for a trailing logical
            # expression, but if the expression is a function call, code block,
            # or some kind of list it puts in an unwanted extra indentation
            # level which is hard to remove.
            #
            # Example where extra indentation looks ok:
            # return 1
            #   if $det_a < 0 and $det_b > 0
            #       or $det_a > 0 and $det_b < 0;
            #
            # Example where extra indentation is not needed because
            # the eval brace also provides indentation:
            # print "not " if defined eval {
            #     reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
            # };
            #
            # The following rule works fairly well:
            #   Undo the flag if the end of this line, or start of the next
            #   line, is an opening container token or a comma.
            # This almost always works, but if not after another pass it will
            # be stable.
            if ( $forced_indentation_flag && $type eq 'k' ) {
                my $ixlast  = -1;
                my $ilast   = $routput_token_list->[$ixlast];
                my $toklast = $routput_token_type->[$ilast];
                if ( $toklast eq '#' ) {
                    $ixlast--;
                    $ilast   = $routput_token_list->[$ixlast];
                    $toklast = $routput_token_type->[$ilast];
                }
                if ( $toklast eq 'b' ) {
                    $ixlast--;
                    $ilast   = $routput_token_list->[$ixlast];
                    $toklast = $routput_token_type->[$ilast];
                }
                if ( $toklast =~ /^[\{,]$/ ) {
                    $forced_indentation_flag = 0;
                }
                else {
                    ( $toklast, my $i_next ) =
                      find_next_nonblank_token( $max_token_index, $rtokens,
                        $max_token_index );
                    if ( $toklast =~ /^[\{,]$/ ) {
                        $forced_indentation_flag = 0;
                    }
                }
            }

# if we are already in an indented if, see if we should outdent
            if ($indented_if_level) {

# don't try to nest trailing if's - shouldn't happen
                if ( $type eq 'k' ) {
                    $forced_indentation_flag = 0;
                }

# check for the normal case - outdenting at next ';'
                elsif ( $type eq ';' ) {
                    if ( $level_in_tokenizer == $indented_if_level ) {
                        $forced_indentation_flag = -1;
                        $indented_if_level       = 0;
                    }
                }

# handle case of missing semicolon
                elsif ( $type eq '}' ) {
                    if ( $level_in_tokenizer == $indented_if_level ) {
                        $indented_if_level = 0;

# TBD: This could be a subroutine call
                        $level_in_tokenizer--;
                        if ( @{$rslevel_stack} > 1 ) {
                            pop( @{$rslevel_stack} );
                        }
                        if ( length($nesting_block_string) > 1 )
                        {    # true for valid script
                            chop $nesting_block_string;
                            chop $nesting_list_string;
                        }

}
                }
            }

my $tok = $rtokens->[$i];  # the token, but ONLY if same as pretoken
            $level_i = $level_in_tokenizer;

# This can happen by running perltidy on non-scripts
            # although it could also be bug introduced by programming change.
            # Perl silently accepts a 032 (^Z) and takes it as the end
            if ( !$is_valid_token_type{$type} ) {
                my $val = ord($type);
                warning(
                    "unexpected character decimal $val ($type) in script\n");
                $tokenizer_self->[_in_error_] = 1;
            }

# ----------------------------------------------------------------
            # TOKEN TYPE PATCHES
            #  output __END__, __DATA__, and format as type 'k' instead of ';'
            # to make html colors correct, etc.
            my $fix_type = $type;
            if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }

# output anonymous 'sub' as keyword
            if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }

# -----------------------------------------------------------------

$nesting_token_string_i = $nesting_token_string;
            $nesting_type_string_i  = $nesting_type_string;
            $nesting_block_string_i = $nesting_block_string;
            $nesting_list_string_i  = $nesting_list_string;

# set primary indentation levels based on structural braces
            # Note: these are set so that the leading braces have a HIGHER
            # level than their CONTENTS, which is convenient for indentation
            # Also, define continuation indentation for each token.
            if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
            {

# use environment before updating
                $container_environment =
                    $nesting_block_flag ? 'BLOCK'
                  : $nesting_list_flag  ? 'LIST'
                  :                       "";

# if the difference between total nesting levels is not 1,
                # there are intervening non-structural nesting types between
                # this '{' and the previous unclosed '{'
                my $intervening_secondary_structure = 0;
                if ( @{$rslevel_stack} ) {
                    $intervening_secondary_structure =
                      $slevel_in_tokenizer - $rslevel_stack->[-1];
                }

# Continuation Indentation
     #
     # Having tried setting continuation indentation both in the formatter and
     # in the tokenizer, I can say that setting it in the tokenizer is much,
     # much easier.  The formatter already has too much to do, and can't
     # make decisions on line breaks without knowing what 'ci' will be at
     # arbitrary locations.
     #
     # But a problem with setting the continuation indentation (ci) here
     # in the tokenizer is that we do not know where line breaks will actually
     # be.  As a result, we don't know if we should propagate continuation
     # indentation to higher levels of structure.
     #
     # For nesting of only structural indentation, we never need to do this.
     # For example, in a long if statement, like this
     #
     #   if ( !$output_block_type[$i]
     #     && ($in_statement_continuation) )
     #   {           <--outdented
     #       do_something();
     #   }
     #
     # the second line has ci but we do normally give the lines within the BLOCK
     # any ci.  This would be true if we had blocks nested arbitrarily deeply.
     #
     # But consider something like this, where we have created a break after
     # an opening paren on line 1, and the paren is not (currently) a
     # structural indentation token:
     #
     # my $file = $menubar->Menubutton(
     #   qw/-text File -underline 0 -menuitems/ => [
     #       [
     #           Cascade    => '~View',
     #           -menuitems => [
     #           ...
     #
     # The second line has ci, so it would seem reasonable to propagate it
     # down, giving the third line 1 ci + 1 indentation.  This suggests the
     # following rule, which is currently used to propagating ci down: if there
     # are any non-structural opening parens (or brackets, or braces), before
     # an opening structural brace, then ci is propagated down, and otherwise
     # not.  The variable $intervening_secondary_structure contains this
     # information for the current token, and the string
     # "$ci_string_in_tokenizer" is a stack of previous values of this
     # variable.

# save the current states
                push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
                $level_in_tokenizer++;

if ( $level_in_tokenizer > $tokenizer_self->[_maximum_level_] )
                {
                    $tokenizer_self->[_maximum_level_] = $level_in_tokenizer;
                }

if ($forced_indentation_flag) {

# break BEFORE '?' when there is forced indentation
                    if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
                    if ( $type eq 'k' ) {
                        $indented_if_level = $level_in_tokenizer;
                    }

# do not change container environment here if we are not
                    # at a real list. Adding this check prevents "blinkers"
                    # often near 'unless" clauses, such as in the following
                    # code:
##          next
##            unless -e (
##                    $archive =
##                      File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
##            );

$nesting_block_string .= "$nesting_block_flag";
                }
                else {

if ( $routput_block_type->[$i] ) {
                        $nesting_block_flag = 1;
                        $nesting_block_string .= '1';
                    }
                    else {
                        $nesting_block_flag = 0;
                        $nesting_block_string .= '0';
                    }
                }

# we will use continuation indentation within containers
                # which are not blocks and not logical expressions
                my $bit = 0;
                if ( !$routput_block_type->[$i] ) {

# propagate flag down at nested open parens
                    if ( $routput_container_type->[$i] eq '(' ) {
                        $bit = 1 if $nesting_list_flag;
                    }

# use list continuation if not a logical grouping
                  # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
                    else {
                        $bit = 1
                          unless
                          $is_logical_container{ $routput_container_type->[$i]
                          };
                    }
                }
                $nesting_list_string .= $bit;
                $nesting_list_flag = $bit;

$ci_string_in_tokenizer .=
                  ( $intervening_secondary_structure != 0 ) ? '1' : '0';
                $ci_string_sum =
                  ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
                $continuation_string_in_tokenizer .=
                  ( $in_statement_continuation > 0 ) ? '1' : '0';

#  Sometimes we want to give an opening brace continuation indentation,
   #  and sometimes not.  For code blocks, we don't do it, so that the leading
   #  '{' gets outdented, like this:
   #
   #   if ( !$output_block_type[$i]
   #     && ($in_statement_continuation) )
   #   {           <--outdented
   #
   #  For other types, we will give them continuation indentation.  For example,
   #  here is how a list looks with the opening paren indented:
   #
   #     @LoL =
   #       ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
   #         [ "homer", "marge", "bart" ], );
   #
   #  This looks best when 'ci' is one-half of the indentation  (i.e., 2 and 4)

my $total_ci = $ci_string_sum;
                if (
                    !$routput_block_type->[$i]    # patch: skip for BLOCK
                    && ($in_statement_continuation)
                    && !( $forced_indentation_flag && $type eq ':' )
                  )
                {
                    $total_ci += $in_statement_continuation
                      unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' );
                }

$ci_string_i               = $total_ci;
                $in_statement_continuation = 0;
            }

elsif ($type eq '}'
                || $type eq 'R'
                || $forced_indentation_flag < 0 )
            {

# only a nesting error in the script would prevent popping here
                if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }

$level_i = --$level_in_tokenizer;

# restore previous level values
                if ( length($nesting_block_string) > 1 )
                {    # true for valid script
                    chop $nesting_block_string;
                    $nesting_block_flag =
                      substr( $nesting_block_string, -1 ) eq '1';
                    chop $nesting_list_string;
                    $nesting_list_flag =
                      substr( $nesting_list_string, -1 ) eq '1';

chop $ci_string_in_tokenizer;
                    $ci_string_sum =
                      ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;

$in_statement_continuation =
                      chop $continuation_string_in_tokenizer;

# zero continuation flag at terminal BLOCK '}' which
                    # ends a statement.
                    if ( $routput_block_type->[$i] ) {

# ...These include non-anonymous subs
                        # note: could be sub ::abc { or sub 'abc
                        if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {

# note: older versions of perl require the /gc modifier
                         # here or else the \G does not work.
                            if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
                            {
                                $in_statement_continuation = 0;
                            }
                        }

# ...and include all block types except user subs with
# block prototypes and these: (sort|grep|map|do|eval)
# /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
                        elsif (
                            $is_zero_continuation_block_type{
                                $routput_block_type->[$i] } )
                        {
                            $in_statement_continuation = 0;
                        }

# ..but these are not terminal types:
                        #     /^(sort|grep|map|do|eval)$/ )
                        elsif (
                            $is_not_zero_continuation_block_type{
                                $routput_block_type->[$i] } )
                        {
                        }

# ..and a block introduced by a label
                        # /^\w+\s*:$/gc ) {
                        elsif ( $routput_block_type->[$i] =~ /:$/ ) {
                            $in_statement_continuation = 0;
                        }

# user function with block prototype
                        else {
                            $in_statement_continuation = 0;
                        }
                    }

# If we are in a list, then
                    # we must set continuation indentation at the closing
                    # paren of something like this (paren after $check):
                    #     assert(
                    #         __LINE__,
                    #         ( not defined $check )
                    #           or ref $check
                    #           or $check eq "new"
                    #           or $check eq "old",
                    #     );
                    elsif ( $tok eq ')' ) {
                        $in_statement_continuation = 1
                          if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
                    }

elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
                }

# use environment after updating
                $container_environment =
                    $nesting_block_flag ? 'BLOCK'
                  : $nesting_list_flag  ? 'LIST'
                  :                       "";
                $ci_string_i = $ci_string_sum + $in_statement_continuation;
                $nesting_block_string_i = $nesting_block_string;
                $nesting_list_string_i  = $nesting_list_string;
            }

# not a structural indentation type..
            else {

$container_environment =
                    $nesting_block_flag ? 'BLOCK'
                  : $nesting_list_flag  ? 'LIST'
                  :                       "";

# zero the continuation indentation at certain tokens so
                # that they will be at the same level as its container.  For
                # commas, this simplifies the -lp indentation logic, which
                # counts commas.  For ?: it makes them stand out.
                if ($nesting_list_flag) {
                    ##      $type =~ /^[,\?\:]$/
                    if ( $is_comma_question_colon{$type} ) {
                        $in_statement_continuation = 0;
                    }
                }

# be sure binary operators get continuation indentation
                if (
                    $container_environment
                    && (   $type eq 'k' && $is_binary_keyword{$tok}
                        || $is_binary_type{$type} )
                  )
                {
                    $in_statement_continuation = 1;
                }

# continuation indentation is sum of any open ci from previous
                # levels plus the current level
                $ci_string_i = $ci_string_sum + $in_statement_continuation;

# update continuation flag ...
                # if this isn't a blank or comment..
                if ( $type ne 'b' && $type ne '#' ) {

# and we are in a BLOCK
                    if ($nesting_block_flag) {

# the next token after a ';' and label starts a new stmt
                        if ( $type eq ';' || $type eq 'J' ) {
                            $in_statement_continuation = 0;
                        }

# otherwise, we are continuing the current statement
                        else {
                            $in_statement_continuation = 1;
                        }
                    }

# if we are not in a BLOCK..
                    else {

# do not use continuation indentation if not list
                        # environment (could be within if/elsif clause)
                        if ( !$nesting_list_flag ) {
                            $in_statement_continuation = 0;
                        }

# otherwise, the token after a ',' starts a new term

# Patch FOR RT#99961; no continuation after a ';'
                        # This is needed because perltidy currently marks
                        # a block preceded by a type character like % or @
                        # as a non block, to simplify formatting. But these
                        # are actually blocks and can have semicolons.
                        # See code_block_type() and is_non_structural_brace().
                        elsif ( $type eq ',' || $type eq ';' ) {
                            $in_statement_continuation = 0;
                        }

# otherwise, we are continuing the current term
                        else {
                            $in_statement_continuation = 1;
                        }
                    }
                }
            }

if ( $level_in_tokenizer < 0 ) {
                unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
                    $tokenizer_self->[_saw_negative_indentation_] = 1;
                    warning("Starting negative indentation\n");
                }
            }

# set secondary nesting levels based on all containment token types
            # Note: these are set so that the nesting depth is the depth
            # of the PREVIOUS TOKEN, which is convenient for setting
            # the strength of token bonds
            my $slevel_i = $slevel_in_tokenizer;

#    /^[L\{\(\[]$/
            if ( $is_opening_type{$type} ) {
                $slevel_in_tokenizer++;
                $nesting_token_string .= $tok;
                $nesting_type_string  .= $type;
            }

#       /^[R\}\)\]]$/
            elsif ( $is_closing_type{$type} ) {
                $slevel_in_tokenizer--;
                my $char = chop $nesting_token_string;

if ( $char ne $matching_start_token{$tok} ) {
                    $nesting_token_string .= $char . $tok;
                    $nesting_type_string  .= $type;
                }
                else {
                    chop $nesting_type_string;
                }
            }

push( @block_type,            $routput_block_type->[$i] );
            push( @ci_string,             $ci_string_i );
            push( @container_environment, $container_environment );
            push( @container_type,        $routput_container_type->[$i] );
            push( @levels,                $level_i );
            push( @nesting_tokens,        $nesting_token_string_i );
            push( @nesting_types,         $nesting_type_string_i );
            push( @slevels,               $slevel_i );
            push( @token_type,            $fix_type );
            push( @type_sequence,         $routput_type_sequence->[$i] );
            push( @nesting_blocks,        $nesting_block_string );
            push( @nesting_lists,         $nesting_list_string );

# now form the previous token
            if ( $im >= 0 ) {
                $num =
                  $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters

if ( $num > 0 ) {
                    push( @tokens,
                        substr( $input_line, $rtoken_map->[$im], $num ) );
                }
            }
            $im = $i;
        }

$num = length($input_line) - $rtoken_map->[$im];   # make the last token
        if ( $num > 0 ) {
            push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
        }

$tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
        $tokenizer_self->[_in_quote_]          = $in_quote;
        $tokenizer_self->[_quote_target_] =
          $in_quote ? matching_end_token($quote_character) : "";
        $tokenizer_self->[_rhere_target_list_] = $rhere_target_list;

$line_of_tokens->{_rtoken_type}            = \@token_type;
        $line_of_tokens->{_rtokens}                = \@tokens;
        $line_of_tokens->{_rblock_type}            = \@block_type;
        $line_of_tokens->{_rcontainer_type}        = \@container_type;
        $line_of_tokens->{_rcontainer_environment} = \@container_environment;
        $line_of_tokens->{_rtype_sequence}         = \@type_sequence;
        $line_of_tokens->{_rlevels}                = \@levels;
        $line_of_tokens->{_rslevels}               = \@slevels;
        $line_of_tokens->{_rnesting_tokens}        = \@nesting_tokens;
        $line_of_tokens->{_rci_levels}             = \@ci_string;
        $line_of_tokens->{_rnesting_blocks}        = \@nesting_blocks;

return;
    }
}    # end tokenize_this_line

#########i#############################################################
# Tokenizer routines which assist in identifying token types
#######################################################################

# hash lookup table of operator expected values
my %op_expected_table;

BEGIN {

# Always expecting TERM following these types:
    # note: this is identical to '@value_requestor_type' defined later.
    my @q = qw(
      ; ! + x & ?  F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
      || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
      &= // >> ~. &. |. ^.
      ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
    );
    push @q, ',';
    push @q, '(';    # for completeness, not currently a token type
    @{op_expected_table}{@q} = (TERM) x scalar(@q);

# Always UNKNOWN following these types:
    @q = qw( w );
    @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);

# Always expecting OPERATOR ...
    # 'n' and 'v' are currently excluded because they might be VERSION numbers
    # 'i' is currently excluded because it might be a package
    # 'q' is currently excluded because it might be a prototype
    @q = qw( -- C -> h R ++ ] Q <> );    ## n v q i );
    push @q, ')';
    @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);

}

sub operator_expected {

# Returns a parameter indicating what types of tokens can occur next

# Call format:
    #    $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
    # where
    #    $prev_type is the type of the previous token (blank or not)
    #    $tok is the current token
    #    $next_type is the type of the next token (blank or not)

# Many perl symbols have two or more meanings.  For example, '<<'
    # can be a shift operator or a here-doc operator.  The
    # interpretation of these symbols depends on the current state of
    # the tokenizer, which may either be expecting a term or an
    # operator.  For this example, a << would be a shift if an OPERATOR
    # is expected, and a here-doc if a TERM is expected.  This routine
    # is called to make this decision for any current token.  It returns
    # one of three possible values:
    #
    #     OPERATOR - operator expected (or at least, not a term)
    #     UNKNOWN  - can't tell
    #     TERM     - a term is expected (or at least, not an operator)
    #
    # The decision is based on what has been seen so far.  This
    # information is stored in the "$last_nonblank_type" and
    # "$last_nonblank_token" variables.  For example, if the
    # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
    # if $last_nonblank_type is 'n' (numeric), we are expecting an
    # OPERATOR.
    #
    # If a UNKNOWN is returned, the calling routine must guess. A major
    # goal of this tokenizer is to minimize the possibility of returning
    # UNKNOWN, because a wrong guess can spoil the formatting of a
    # script.
    #
    # Adding NEW_TOKENS: it is critically important that this routine be
    # updated to allow it to determine if an operator or term is to be
    # expected after the new token.  Doing this simply involves adding
    # the new token character to one of the regexes in this routine or
    # to one of the hash lists
    # that it uses, which are initialized in the BEGIN section.
    # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
    # $statement_type

# When possible, token types should be selected such that we can determine
    # the 'operator_expected' value by a simple hash lookup.  If there are
    # exceptions, that is an indication that a new type is needed.

my ($rarg) = @_;

##############
    # Table lookup
    ##############

# Many types are can be obtained by a table lookup given the previous type.
    # This typically handles half or more of the calls.
    my $op_expected = $op_expected_table{$last_nonblank_type};
    goto RETURN if ( defined($op_expected) );

######################
    # Handle special cases
    ######################

$op_expected = UNKNOWN;
    my ( $prev_type, $tok, $next_type ) = @{$rarg};

# Types 'k', '}' and 'Z' depend on context
    # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
    # context but that dependence could eventually be eliminated with better
    # token type definition

# identifier...
    if ( $last_nonblank_type eq 'i' ) {
        $op_expected = OPERATOR;

# FIXME: it would be cleaner to make this a special type
        # expecting VERSION or {} after package NAMESPACE
        # TODO: maybe mark these words as type 'Y'?
        if (   $statement_type =~ /^package\b/
            && $last_nonblank_token =~ /^package\b/ )
        {
            $op_expected = TERM;
        }
    }

# keyword...
    elsif ( $last_nonblank_type eq 'k' ) {
        $op_expected = TERM;
        if ( $expecting_operator_token{$last_nonblank_token} ) {
            $op_expected = OPERATOR;
        }
        elsif ( $expecting_term_token{$last_nonblank_token} ) {

# Exceptions from TERM:

# // may follow perl functions which may be unary operators
            # see test file dor.t (defined or);
            if (   $tok eq '/'
                && $next_type eq '/'
                && $is_keyword_rejecting_slash_as_pattern_delimiter{
                    $last_nonblank_token} )
            {
                $op_expected = OPERATOR;
            }

# Patch to allow a ? following 'split' to be a depricated pattern
            # delimiter.  This patch is coordinated with the omission of split
            # from the list
            # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
            # will force perltidy to guess.
            elsif ($tok eq '?'
                && $last_nonblank_token eq 'split' )
            {
                $op_expected = UNKNOWN;
            }
        }
    } ## end type 'k'

# closing container token...

# Note that the actual token for type '}' may also be a ')'.

# Also note that $last_nonblank_token is not the token corresponding to
    # $last_nonblank_type when the type is a closing container.  In that
    # case it is the token before the corresponding opening container token.
    # So for example, for this snippet
    #       $a = do { BLOCK } / 2;
    # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.

elsif ( $last_nonblank_type eq '}' ) {
        $op_expected = UNKNOWN;

# handle something after 'do' and 'eval'
        if ( $is_block_operator{$last_nonblank_token} ) {

# something like $a = do { BLOCK } / 2;
            $op_expected = OPERATOR;    # block mode following }
        }

elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
            $op_expected = OPERATOR;
            if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }

}

# Check for smartmatch operator before preceding brace or square
        # bracket.  For example, at the ? after the ] in the following
        # expressions we are expecting an operator:
        #
        # qr/3/ ~~ ['1234'] ? 1 : 0;
        # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
        elsif ( $last_nonblank_token eq '~~' ) {
            $op_expected = OPERATOR;
        }

# A right brace here indicates the end of a simple block.  All
        # non-structural right braces have type 'R' all braces associated with
        # block operator keywords have been given those keywords as
        # "last_nonblank_token" and caught above.  (This statement is order
        # dependent, and must come after checking $last_nonblank_token).
        else {

# patch for dor.t (defined or).
            if (   $tok eq '/'
                && $next_type eq '/'
                && $last_nonblank_token eq ']' )
            {
                $op_expected = OPERATOR;
            }

# Patch for RT #116344: misparse a ternary operator after an
            # anonymous hash, like this:
            #   return ref {} ? 1 : 0;
            # The right brace should really be marked type 'R' in this case,
            # and it is safest to return an UNKNOWN here. Expecting a TERM will
            # cause the '?' to always be interpreted as a pattern delimiter
            # rather than introducing a ternary operator.
            elsif ( $tok eq '?' ) {
                $op_expected = UNKNOWN;
            }
            else {
                $op_expected = TERM;
            }
        }
    } ## end type '}'

# number or v-string...
    # An exception is for VERSION numbers a 'use' statement. It has the format
    #     use Module VERSION LIST
    # We could avoid this exception by writing a special sub to parse 'use'
    # statements and perhaps mark these numbers with a new type V (for VERSION)
    elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
        $op_expected = OPERATOR;
        if ( $statement_type eq 'use' ) {
            $op_expected = UNKNOWN;
        }
    }

# quote...
    # FIXME: labeled prototype words should probably be given type 'A' or maybe
    # 'J'; not 'q'; or maybe mark as type 'Y'
    elsif ( $last_nonblank_type eq 'q' ) {
        $op_expected = OPERATOR;
        if ( $last_nonblank_token eq 'prototype' )
          ##|| $last_nonblank_token eq 'switch' )
        {
            $op_expected = TERM;
        }
    }

# file handle or similar
    elsif ( $last_nonblank_type eq 'Z' ) {

$op_expected = UNKNOWN;

# angle.t
        if ( $last_nonblank_token =~ /^\w/ ) {
            $op_expected = UNKNOWN;
        }

# The 'weird parsing rules' of next section do not work for '<' and '?'
        # It is best to mark them as unknown.  Test case:
        #  print $fh <DATA>;
        elsif ( $tok =~ /^[\<\?]$/ ) {
            $op_expected = UNKNOWN;
        }

# For possible file handle like "$a", Perl uses weird parsing rules.
        # For example:
        # print $a/2,"/hi";   - division
        # print $a / 2,"/hi"; - division
        # print $a/ 2,"/hi";  - division
        # print $a /2,"/hi";  - pattern (and error)!
        # Some examples where this logic works okay, for '&','*','+':
        #    print $fh &xsi_protos(@mods);
        #    my $x = new $CompressClass *FH;
        #    print $OUT +( $count % 15 ? ", " : "\n\t" );
        elsif ($prev_type eq 'b'
            && $next_type ne 'b' )
        {
            $op_expected = TERM;
        }

# Note that '?' and '<' have been moved above
        # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
        elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {

# Do not complain in 'use' statements, which have special syntax.
            # For example, from RT#130344:
            #   use lib $FindBin::Bin . '/lib';
            if ( $statement_type ne 'use' ) {
                complain("operator in print statement not recommended\n");
            }
            $op_expected = OPERATOR;
        }
    }

# anything else...
    else {
        $op_expected = UNKNOWN;
    }

RETURN:

# debug and diagnostics can go here..

0 && do {
        print STDOUT
"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
    };

return $op_expected;

} ## end of sub operator_expected

sub new_statement_ok {

# return true if the current token can start a new statement
    # USES GLOBAL VARIABLES: $last_nonblank_type

return label_ok()    # a label would be ok here

|| $last_nonblank_type eq 'J';    # or we follow a label

}

sub label_ok {

# Decide if a bare word followed by a colon here is a label
    # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
    # $brace_depth, @brace_type

# if it follows an opening or closing code block curly brace..
    if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
        && $last_nonblank_type eq $last_nonblank_token )
    {

# it is a label if and only if the curly encloses a code block
        return $brace_type[$brace_depth];
    }

# otherwise, it is a label if and only if it follows a ';' (real or fake)
    # or another label
    else {
        return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
    }
}

sub code_block_type {

# Decide if this is a block of code, and its type.
    # Must be called only when $type = $token = '{'
    # The problem is to distinguish between the start of a block of code
    # and the start of an anonymous hash reference
    # Returns "" if not code block, otherwise returns 'last_nonblank_token'
    # to indicate the type of code block.  (For example, 'last_nonblank_token'
    # might be 'if' for an if block, 'else' for an else block, etc).
    # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
    # $last_nonblank_block_type, $brace_depth, @brace_type

# handle case of multiple '{'s

# print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";

my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
    if (   $last_nonblank_token eq '{'
        && $last_nonblank_type eq $last_nonblank_token )
    {

# opening brace where a statement may appear is probably
        # a code block but might be and anonymous hash reference
        if ( $brace_type[$brace_depth] ) {
            return decide_if_code_block( $i, $rtokens, $rtoken_type,
                $max_token_index );
        }

# cannot start a code block within an anonymous hash
        else {
            return "";
        }
    }

elsif ( $last_nonblank_token eq ';' ) {

# an opening brace where a statement may appear is probably
        # a code block but might be and anonymous hash reference
        return decide_if_code_block( $i, $rtokens, $rtoken_type,
            $max_token_index );
    }

# handle case of '}{'
    elsif ($last_nonblank_token eq '}'
        && $last_nonblank_type eq $last_nonblank_token )
    {

# a } { situation ...
        # could be hash reference after code block..(blktype1.t)
        if ($last_nonblank_block_type) {
            return decide_if_code_block( $i, $rtokens, $rtoken_type,
                $max_token_index );
        }

# must be a block if it follows a closing hash reference
        else {
            return $last_nonblank_token;
        }
    }

################################################################
    # NOTE: braces after type characters start code blocks, but for
    # simplicity these are not identified as such.  See also
    # sub is_non_structural_brace.
    ################################################################

##    elsif ( $last_nonblank_type eq 't' ) {
##       return $last_nonblank_token;
##    }

# brace after label:
    elsif ( $last_nonblank_type eq 'J' ) {
        return $last_nonblank_token;
    }

# otherwise, look at previous token.  This must be a code block if
# it follows any of these:
# /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
    elsif ( $is_code_block_token{$last_nonblank_token} ) {

# Bug Patch: Note that the opening brace after the 'if' in the following
        # snippet is an anonymous hash ref and not a code block!
        #   print 'hi' if { x => 1, }->{x};
        # We can identify this situation because the last nonblank type
        # will be a keyword (instead of a closing peren)
        if (   $last_nonblank_token =~ /^(if|unless)$/
            && $last_nonblank_type eq 'k' )
        {
            return "";
        }
        else {
            return $last_nonblank_token;
        }
    }

# or a sub or package BLOCK
    elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
        && $last_nonblank_token =~ /^(sub|package)\b/ )
    {
        return $last_nonblank_token;
    }

# or a sub alias
    elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
        && ( $is_sub{$last_nonblank_token} ) )
    {
        return 'sub';
    }

elsif ( $statement_type =~ /^(sub|package)\b/ ) {
        return $statement_type;
    }

# user-defined subs with block parameters (like grep/map/eval)
    elsif ( $last_nonblank_type eq 'G' ) {
        return $last_nonblank_token;
    }

# check bareword
    elsif ( $last_nonblank_type eq 'w' ) {
        return decide_if_code_block( $i, $rtokens, $rtoken_type,
            $max_token_index );
    }

# Patch for bug # RT #94338 reported by Daniel Trizen
    # for-loop in a parenthesized block-map triggering an error message:
    #    map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
    # Check for a code block within a parenthesized function call
    elsif ( $last_nonblank_token eq '(' ) {
        my $paren_type = $paren_type[$paren_depth];
        if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {

# We will mark this as a code block but use type 't' instead
            # of the name of the contining function.  This will allow for
            # correct parsing but will usually produce better formatting.
            # Braces with block type 't' are not broken open automatically
            # in the formatter as are other code block types, and this usually
            # works best.
            return 't';    # (Not $paren_type)
        }
        else {
            return "";
        }
    }

# handle unknown syntax ') {'
    # we previously appended a '()' to mark this case
    elsif ( $last_nonblank_token =~ /$/ ) {
        return $last_nonblank_token;
    }

# anything else must be anonymous hash reference
    else {
        return "";
    }
}

sub decide_if_code_block {

# USES GLOBAL VARIABLES: $last_nonblank_token
    my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;

my ( $next_nonblank_token, $i_next ) =
      find_next_nonblank_token( $i, $rtokens, $max_token_index );

# we are at a '{' where a statement may appear.
    # We must decide if this brace starts an anonymous hash or a code
    # block.
    # return "" if anonymous hash, and $last_nonblank_token otherwise

# initialize to be code BLOCK
    my $code_block_type = $last_nonblank_token;

# Check for the common case of an empty anonymous hash reference:
    # Maybe something like sub { { } }
    if ( $next_nonblank_token eq '}' ) {
        $code_block_type = "";
    }

else {

# To guess if this '{' is an anonymous hash reference, look ahead
        # and test as follows:
        #
        # it is a hash reference if next come:
        #   - a string or digit followed by a comma or =>
        #   - bareword followed by =>
        # otherwise it is a code block
        #
        # Examples of anonymous hash ref:
        # {'aa',};
        # {1,2}
        #
        # Examples of code blocks:
        # {1; print "hello\n", 1;}
        # {$a,1};

# We are only going to look ahead one more (nonblank/comment) line.
        # Strange formatting could cause a bad guess, but that's unlikely.
        my @pre_types;
        my @pre_tokens;

# Ignore the rest of this line if it is a side comment
        if ( $next_nonblank_token ne '#' ) {
            @pre_types  = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
            @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
        }
        my ( $rpre_tokens, $rpre_types ) =
          peek_ahead_for_n_nonblank_pre_tokens(20);    # 20 is arbitrary but
                                                       # generous, and prevents
                                                       # wasting lots of
                                                       # time in mangled files
        if ( defined($rpre_types) && @{$rpre_types} ) {
            push @pre_types,  @{$rpre_types};
            push @pre_tokens, @{$rpre_tokens};
        }

# put a sentinel token to simplify stopping the search
        push @pre_types, '}';
        push @pre_types, '}';

my $jbeg = 0;
        $jbeg = 1 if $pre_types[0] eq 'b';

# first look for one of these
        #  - bareword
        #  - bareword with leading -
        #  - digit
        #  - quoted string
        my $j = $jbeg;
        if ( $pre_types[$j] =~ /^[\'\"]/ ) {

# find the closing quote; don't worry about escapes
            my $quote_mark = $pre_types[$j];
            foreach my $k ( $j + 1 .. @pre_types - 2 ) {
                if ( $pre_types[$k] eq $quote_mark ) {
                    $j = $k + 1;
                    my $next = $pre_types[$j];
                    last;
                }
            }
        }
        elsif ( $pre_types[$j] eq 'd' ) {
            $j++;
        }
        elsif ( $pre_types[$j] eq 'w' ) {
            $j++;
        }
        elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
            $j++;
        }
        if ( $j > $jbeg ) {

$j++ if $pre_types[$j] eq 'b';

# Patched for RT #95708
            if (

# it is a comma which is not a pattern delimeter except for qw
                (
                       $pre_types[$j] eq ','
                    && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
                )

# or a =>
                || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
              )
            {
                $code_block_type = "";
            }
        }
    }

return $code_block_type;
}

sub report_unexpected {

# report unexpected token type and show where it is
    # USES GLOBAL VARIABLES: $tokenizer_self
    my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
        $rpretoken_type, $input_line )
      = @_;

if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
        my $msg = "found $found where $expecting expected";
        my $pos = $rpretoken_map->[$i_tok];
        interrupt_logfile();
        my $input_line_number = $tokenizer_self->[_last_line_number_];
        my ( $offset, $numbered_line, $underline ) =
          make_numbered_line( $input_line_number, $input_line, $pos );
        $underline = write_on_underline( $underline, $pos - $offset, '^' );

my $trailer = "";
        if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
            my $pos_prev = $rpretoken_map->[$last_nonblank_i];
            my $num;
            if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
                $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
            }
            else {
                $num = $pos - $pos_prev;
            }
            if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }

$underline =
              write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
            $trailer = " (previous token underlined)";
        }
        $underline =~ s/\s+$//;
        warning( $numbered_line . "\n" );
        warning( $underline . "\n" );
        warning( $msg . $trailer . "\n" );
        resume_logfile();
    }
    return;
}

sub is_non_structural_brace {

# Decide if a brace or bracket is structural or non-structural
    # by looking at the previous token and type
    # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token

# EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
    # Tentatively deactivated because it caused the wrong operator expectation
    # for this code:
    #      $user = @vars[1] / 100;
    # Must update sub operator_expected before re-implementing.
    # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
    #    return 0;
    # }

################################################################
    # NOTE: braces after type characters start code blocks, but for
    # simplicity these are not identified as such.  See also
    # sub code_block_type
    ################################################################

##if ($last_nonblank_type eq 't') {return 0}

# otherwise, it is non-structural if it is decorated
    # by type information.
    # For example, the '{' here is non-structural:   ${xxx}
    return (
        $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/

# or if we follow a hash or array closing curly brace or bracket
          # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
          # because the first '}' would have been given type 'R'
          || $last_nonblank_type =~ /^([R\]])$/
    );
}

#########i#############################################################
# Tokenizer routines for tracking container nesting depths
#######################################################################

# The following routines keep track of nesting depths of the nesting
# types, ( [ { and ?.  This is necessary for determining the indentation
# level, and also for debugging programs.  Not only do they keep track of
# nesting depths of the individual brace types, but they check that each
# of the other brace types is balanced within matching pairs.  For
# example, if the program sees this sequence:
#
#         {  ( ( ) }
#
# then it can determine that there is an extra left paren somewhere
# between the { and the }.  And so on with every other possible
# combination of outer and inner brace types.  For another
# example:
#
#         ( [ ..... ]  ] )
#
# which has an extra ] within the parens.
#
# The brace types have indexes 0 .. 3 which are indexes into
# the matrices.
#
# The pair ? : are treated as just another nesting type, with ? acting
# as the opening brace and : acting as the closing brace.
#
# The matrix
#
#         $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
#
# saves the nesting depth of brace type $b (where $b is either of the other
# nesting types) when brace type $a enters a new depth.  When this depth
# decreases, a check is made that the current depth of brace types $b is
# unchanged, or otherwise there must have been an error.  This can
# be very useful for localizing errors, particularly when perl runs to
# the end of a large file (such as this one) and announces that there
# is a problem somewhere.
#
# A numerical sequence number is maintained for every nesting type,
# so that each matching pair can be uniquely identified in a simple
# way.

sub increase_nesting_depth {
    my ( $aa, $pos ) = @_;

# USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
    # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
    # $statement_type
    $current_depth[$aa]++;
    $total_depth++;
    $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
    my $input_line_number = $tokenizer_self->[_last_line_number_];
    my $input_line        = $tokenizer_self->[_line_of_text_];

# Sequence numbers increment by number of items.  This keeps
    # a unique set of numbers but still allows the relative location
    # of any type to be determined.
    $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
    my $seqno = $nesting_sequence_number[$aa];
    $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;

$starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
      [ $input_line_number, $input_line, $pos ];

for my $bb ( 0 .. @closing_brace_names - 1 ) {
        next if ( $bb == $aa );
        $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
    }

# set a flag for indenting a nested ternary statement
    my $indent = 0;
    if ( $aa == QUESTION_COLON ) {
        $nested_ternary_flag[ $current_depth[$aa] ] = 0;
        if ( $current_depth[$aa] > 1 ) {
            if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
                my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
                if ( $pdepth == $total_depth - 1 ) {
                    $indent = 1;
                    $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
                }
            }
        }
    }
    $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
    $statement_type = "";
    return ( $seqno, $indent );
}

sub is_balanced_closing_container {

# Return true if a closing container can go here without error
    # Return false if not
    my ($aa) = @_;

# cannot close if there was no opening
    return unless ( $current_depth[$aa] > 0 );

# check that any other brace types $bb contained within would be balanced
    for my $bb ( 0 .. @closing_brace_names - 1 ) {
        next if ( $bb == $aa );
        return
          unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
            $current_depth[$bb] );
    }

# OK, everything will be balanced
    return 1;
}

sub decrease_nesting_depth {

my ( $aa, $pos ) = @_;

# USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
    # @current_sequence_number, @depth_array, @starting_line_of_current_depth
    # $statement_type
    my $seqno             = 0;
    my $input_line_number = $tokenizer_self->[_last_line_number_];
    my $input_line        = $tokenizer_self->[_line_of_text_];

my $outdent = 0;
    $total_depth--;
    if ( $current_depth[$aa] > 0 ) {

# set a flag for un-indenting after seeing a nested ternary statement
        $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
        if ( $aa == QUESTION_COLON ) {
            $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
        }
        $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];

# check that any brace types $bb contained within are balanced
        for my $bb ( 0 .. @closing_brace_names - 1 ) {
            next if ( $bb == $aa );

unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
                $current_depth[$bb] )
            {
                my $diff =
                  $current_depth[$bb] -
                  $depth_array[$aa][$bb][ $current_depth[$aa] ];

# don't whine too many times
                my $saw_brace_error = get_saw_brace_error();
                if (
                    $saw_brace_error <= MAX_NAG_MESSAGES

# if too many closing types have occurred, we probably
                    # already caught this error
                    && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
                  )
                {
                    interrupt_logfile();
                    my $rsl =
                      $starting_line_of_current_depth[$aa]
                      [ $current_depth[$aa] ];
                    my $sl  = $rsl->[0];
                    my $rel = [ $input_line_number, $input_line, $pos ];
                    my $el  = $rel->[0];
                    my ($ess);

if ( $diff == 1 || $diff == -1 ) {
                        $ess = '';
                    }
                    else {
                        $ess = 's';
                    }
                    my $bname =
                      ( $diff > 0 )
                      ? $opening_brace_names[$bb]
                      : $closing_brace_names[$bb];
                    write_error_indicator_pair( @{$rsl}, '^' );
                    my $msg = <<"EOM";
Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
EOM

if ( $diff > 0 ) {
                        my $rml =
                          $starting_line_of_current_depth[$bb]
                          [ $current_depth[$bb] ];
                        my $ml = $rml->[0];
                        $msg .=
"    The most recent un-matched $bname is on line $ml\n";
                        write_error_indicator_pair( @{$rml}, '^' );
                    }
                    write_error_indicator_pair( @{$rel}, '^' );
                    warning($msg);
                    resume_logfile();
                }
                increment_brace_error();
            }
        }
        $current_depth[$aa]--;
    }
    else {

my $saw_brace_error = get_saw_brace_error();
        if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
            my $msg = <<"EOM";
There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
EOM
            indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
        }
        increment_brace_error();

# keep track of errors in braces alone (ignoring ternary nesting errors)
        $tokenizer_self->[_true_brace_error_count_]++
          if ( $closing_brace_names[$aa] ne "':'" );
    }
    return ( $seqno, $outdent );
}

sub check_final_nesting_depths {

# USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth

for my $aa ( 0 .. @closing_brace_names - 1 ) {

if ( $current_depth[$aa] ) {
            my $rsl =
              $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
            my $sl  = $rsl->[0];
            my $msg = <<"EOM";
Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
The most recent un-matched $opening_brace_names[$aa] is on line $sl
EOM
            indicate_error( $msg, @{$rsl}, '^' );
            increment_brace_error();
        }
    }
    return;
}

#########i#############################################################
# Tokenizer routines for looking ahead in input stream
#######################################################################

sub peek_ahead_for_n_nonblank_pre_tokens {

# returns next n pretokens if they exist
    # returns undef's if hits eof without seeing any pretokens
    # USES GLOBAL VARIABLES: $tokenizer_self
    my $max_pretokens = shift;
    my $line;
    my $i = 0;
    my ( $rpre_tokens, $rmap, $rpre_types );

while ( $line =
        $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
    {
        $line =~ s/^\s*//;                 # trim leading blanks
        next if ( length($line) <= 0 );    # skip blank
        next if ( $line =~ /^#/ );         # skip comment
        ( $rpre_tokens, $rmap, $rpre_types ) =
          pre_tokenize( $line, $max_pretokens );
        last;
    }
    return ( $rpre_tokens, $rpre_types );
}

# look ahead for next non-blank, non-comment line of code
sub peek_ahead_for_nonblank_token {

# USES GLOBAL VARIABLES: $tokenizer_self
    my ( $rtokens, $max_token_index ) = @_;
    my $line;
    my $i = 0;

while ( $line =
        $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
    {
        $line =~ s/^\s*//;                 # trim leading blanks
        next if ( length($line) <= 0 );    # skip blank
        next if ( $line =~ /^#/ );         # skip comment
        my ( $rtok, $rmap, $rtype ) =
          pre_tokenize( $line, 2 );        # only need 2 pre-tokens
        my $j = $max_token_index + 1;

foreach my $tok ( @{$rtok} ) {
            last if ( $tok =~ "\n" );
            $rtokens->[ ++$j ] = $tok;
        }
        last;
    }
    return $rtokens;
}

#########i#############################################################
# Tokenizer guessing routines for ambiguous situations
#######################################################################

sub guess_if_pattern_or_conditional {

# this routine is called when we have encountered a ? following an
    # unknown bareword, and we must decide if it starts a pattern or not
    # input parameters:
    #   $i - token index of the ? starting possible pattern
    # output parameters:
    #   $is_pattern = 0 if probably not pattern,  =1 if probably a pattern
    #   msg = a warning or diagnostic message
    # USES GLOBAL VARIABLES: $last_nonblank_token

# FIXME: this needs to be rewritten

my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
    my $is_pattern = 0;
    my $msg        = "guessing that ? after $last_nonblank_token starts a ";

if ( $i >= $max_token_index ) {
        $msg .= "conditional (no end to pattern found on the line)\n";
    }
    else {
        my $ibeg = $i;
        $i = $ibeg + 1;
        my $next_token = $rtokens->[$i];    # first token after ?

# look for a possible ending ? on this line..
        my $in_quote        = 1;
        my $quote_depth     = 0;
        my $quote_character = '';
        my $quote_pos       = 0;
        my $quoted_string;
        (
            $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
            $quoted_string
          )
          = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
            $quote_pos, $quote_depth, $max_token_index );

if ($in_quote) {

# we didn't find an ending ? on this line,
            # so we bias towards conditional
            $is_pattern = 0;
            $msg .= "conditional (no ending ? on this line)\n";

# we found an ending ?, so we bias towards a pattern
        }
        else {

# Watch out for an ending ? in quotes, like this
            #    my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
            my $s_quote = 0;
            my $d_quote = 0;
            my $colons  = 0;
            foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
                my $tok = $rtokens->[$ii];
                if ( $tok eq ":" ) { $colons++ }
                if ( $tok eq "'" ) { $s_quote++ }
                if ( $tok eq '"' ) { $d_quote++ }
            }
            if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
                $is_pattern = 0;
                $msg .= "found ending ? but unbalanced quote chars\n";
            }
            elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
                $is_pattern = 1;
                $msg .= "pattern (found ending ? and pattern expected)\n";
            }
            else {
                $msg .= "pattern (uncertain, but found ending ?)\n";
            }
        }
    }
    return ( $is_pattern, $msg );
}

sub guess_if_pattern_or_division {

# this routine is called when we have encountered a / following an
    # unknown bareword, and we must decide if it starts a pattern or is a
    # division
    # input parameters:
    #   $i - token index of the / starting possible pattern
    # output parameters:
    #   $is_pattern = 0 if probably division,  =1 if probably a pattern
    #   msg = a warning or diagnostic message
    # USES GLOBAL VARIABLES: $last_nonblank_token
    my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
    my $is_pattern = 0;
    my $msg        = "guessing that / after $last_nonblank_token starts a ";

if ( $i >= $max_token_index ) {
        $msg .= "division (no end to pattern found on the line)\n";
    }
    else {
        my $ibeg = $i;
        my $divide_expected =
          numerator_expected( $i, $rtokens, $max_token_index );
        $i = $ibeg + 1;
        my $next_token = $rtokens->[$i];    # first token after slash

# One of the things we can look at is the spacing around the slash.
        # There # are four possible spacings around the first slash:
        #
        #     return pi/two;#/;     -/-
        #     return pi/ two;#/;    -/+
        #     return pi / two;#/;   +/+
        #     return pi /two;#/;    +/-   <-- possible pattern
        #
        # Spacing rule: a space before the slash but not after the slash
        # usually indicates a pattern.  We can use this to break ties.

my $is_pattern_by_spacing =
          ( $i > 1 && $next_token ne ' ' && $rtokens->[ $i - 2 ] eq ' ' );

# look for a possible ending / on this line..
        my $in_quote        = 1;
        my $quote_depth     = 0;
        my $quote_character = '';
        my $quote_pos       = 0;
        my $quoted_string;
        (
            $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
            $quoted_string
          )
          = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
            $quote_pos, $quote_depth, $max_token_index );

if ($in_quote) {

# we didn't find an ending / on this line, so we bias towards
            # division
            if ( $divide_expected >= 0 ) {
                $is_pattern = 0;
                $msg .= "division (no ending / on this line)\n";
            }
            else {

# assuming a multi-line pattern ... this is risky, but division
                # does not seem possible.  If this fails, it would either be due
                # to a syntax error in the code, or the division_expected logic
                # needs to be fixed.
                $msg        = "multi-line pattern (division not possible)\n";
                $is_pattern = 1;
            }
        }

# we found an ending /, so we bias slightly towards a pattern
        else {

my $pattern_expected =
              pattern_expected( $i, $rtokens, $max_token_index );

if ( $pattern_expected >= 0 ) {

# pattern looks possible...
                if ( $divide_expected >= 0 ) {

# Both pattern and divide can work here...

# A very common bare word in math expressions is 'pi'
                    if ( $last_nonblank_token eq 'pi' ) {
                        $msg .= "division (pattern works too but saw 'pi')\n";
                        $is_pattern = 0;
                    }

# A very common bare word in pattern expressions is 'ok'
                    elsif ( $last_nonblank_token eq 'ok' ) {
                        $msg .= "pattern (division works too but saw 'ok')\n";
                        $is_pattern = 1;
                    }

# If one rule is more definite, use it
                    elsif ( $divide_expected > $pattern_expected ) {
                        $msg .=
                          "division (more likely based on following tokens)\n";
                        $is_pattern = 0;
                    }

# otherwise, use the spacing rule
                    elsif ($is_pattern_by_spacing) {
                        $msg .=
"pattern (guess on spacing, but division possible too)\n";
                        $is_pattern = 1;
                    }
                    else {
                        $msg .=
"division (guess on spacing, but pattern is possible too)\n";
                        $is_pattern = 0;
                    }
                }

# divide_expected < 0 means divide can not work here
                else {
                    $is_pattern = 1;
                    $msg .= "pattern (division not possible)\n";
                }
            }

# pattern does not look possible...
            else {

if ( $divide_expected >= 0 ) {
                    $is_pattern = 0;
                    $msg .= "division (pattern not possible)\n";
                }

# Neither pattern nor divide look possible...go by spacing
                else {
                    if ($is_pattern_by_spacing) {
                        $msg .= "pattern (guess on spacing)\n";
                        $is_pattern = 1;
                    }
                    else {
                        $msg .= "division (guess on spacing)\n";
                        $is_pattern = 0;
                    }
                }
            }
        }
    }
    return ( $is_pattern, $msg );
}

# try to resolve here-doc vs. shift by looking ahead for
# non-code or the end token (currently only looks for end token)
# returns 1 if it is probably a here doc, 0 if not
sub guess_if_here_doc {

# This is how many lines we will search for a target as part of the
    # guessing strategy.  It is a constant because there is probably
    # little reason to change it.
    # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
    # %is_constant,
    my $HERE_DOC_WINDOW = 40;

my $next_token        = shift;
    my $here_doc_expected = 0;
    my $line;
    my $k   = 0;
    my $msg = "checking <<";

while ( $line =
        $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
    {
        chomp $line;

if ( $line =~ /^$next_token$/ ) {
            $msg .= " -- found target $next_token ahead $k lines\n";
            $here_doc_expected = 1;    # got it
            last;
        }
        last if ( $k >= $HERE_DOC_WINDOW );
    }

unless ($here_doc_expected) {

if ( !defined($line) ) {
            $here_doc_expected = -1;    # hit eof without seeing target
            $msg .= " -- must be shift; target $next_token not in file\n";

}
        else {                          # still unsure..taking a wild guess

if ( !$is_constant{$current_package}{$next_token} ) {
                $here_doc_expected = 1;
                $msg .=
                  " -- guessing it's a here-doc ($next_token not a constant)\n";
            }
            else {
                $msg .=
                  " -- guessing it's a shift ($next_token is a constant)\n";
            }
        }
    }
    write_logfile_entry($msg);
    return $here_doc_expected;
}

#########i#############################################################
# Tokenizer Routines for scanning identifiers and related items
#######################################################################

sub scan_bare_identifier_do {

# this routine is called to scan a token starting with an alphanumeric
    # variable or package separator, :: or '.
    # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
    # $last_nonblank_type,@paren_type, $paren_depth

my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
        $max_token_index )
      = @_;
    my $i_begin = $i;
    my $package = undef;

my $i_beg = $i;

# we have to back up one pretoken at a :: since each : is one pretoken
    if ( $tok eq '::' ) { $i_beg-- }
    if ( $tok eq '->' ) { $i_beg-- }
    my $pos_beg = $rtoken_map->[$i_beg];
    pos($input_line) = $pos_beg;

#  Examples:
    #   A::B::C
    #   A::
    #   ::A
    #   A'B
    if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {

my $pos  = pos($input_line);
        my $numc = $pos - $pos_beg;
        $tok = substr( $input_line, $pos_beg, $numc );

# type 'w' includes anything without leading type info
        # ($,%,@,*) including something like abc::def::ghi
        $type = 'w';

my $sub_name = "";
        if ( defined($2) ) { $sub_name = $2; }
        if ( defined($1) ) {
            $package = $1;

# patch: don't allow isolated package name which just ends
            # in the old style package separator (single quote).  Example:
            #   use CGI':all';
            if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
                $pos--;
            }

$package =~ s/\'/::/g;
            if ( $package =~ /^\:/ ) { $package = 'main' . $package }
            $package =~ s/::$//;
        }
        else {
            $package = $current_package;

if ( $is_keyword{$tok} ) {
                $type = 'k';
            }
        }

# if it is a bareword..
        if ( $type eq 'w' ) {

# check for v-string with leading 'v' type character
            # (This seems to have precedence over filehandle, type 'Y')
            if ( $tok =~ /^v\d[_\d]*$/ ) {

# we only have the first part - something like 'v101' -
                # look for more
                if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
                    $pos  = pos($input_line);
                    $numc = $pos - $pos_beg;
                    $tok  = substr( $input_line, $pos_beg, $numc );
                }
                $type = 'v';

# warn if this version can't handle v-strings
                report_v_string($tok);
            }

elsif ( $is_constant{$package}{$sub_name} ) {
                $type = 'C';
            }

# bareword after sort has implied empty prototype; for example:
            # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
            # This has priority over whatever the user has specified.
            elsif ($last_nonblank_token eq 'sort'
                && $last_nonblank_type eq 'k' )
            {
                $type = 'Z';
            }

# Note: strangely, perl does not seem to really let you create
            # functions which act like eval and do, in the sense that eval
            # and do may have operators following the final }, but any operators
            # that you create with prototype (&) apparently do not allow
            # trailing operators, only terms.  This seems strange.
            # If this ever changes, here is the update
            # to make perltidy behave accordingly:

# elsif ( $is_block_function{$package}{$tok} ) {
            #    $tok='eval'; # patch to do braces like eval  - doesn't work
            #    $type = 'k';
            #}
            # FIXME: This could become a separate type to allow for different
            # future behavior:
            elsif ( $is_block_function{$package}{$sub_name} ) {
                $type = 'G';
            }
            elsif ( $is_block_list_function{$package}{$sub_name} ) {
                $type = 'G';
            }
            elsif ( $is_user_function{$package}{$sub_name} ) {
                $type      = 'U';
                $prototype = $user_function_prototype{$package}{$sub_name};
            }

# check for indirect object
            elsif (

# added 2001-03-27: must not be followed immediately by '('
                # see fhandle.t
                ( $input_line !~ m/\G\(/gc )

# and
                && (

# preceded by keyword like 'print', 'printf' and friends
                    $is_indirect_object_taker{$last_nonblank_token}

# or preceded by something like 'print(' or 'printf('
                    || (
                        ( $last_nonblank_token eq '(' )
                        && $is_indirect_object_taker{ $paren_type[$paren_depth]
                        }

)
                )
              )
            {

# may not be indirect object unless followed by a space
                if ( $input_line =~ m/\G\s+/gc ) {
                    $type = 'Y';

# Abandon Hope ...
                    # Perl's indirect object notation is a very bad
                    # thing and can cause subtle bugs, especially for
                    # beginning programmers.  And I haven't even been
                    # able to figure out a sane warning scheme which
                    # doesn't get in the way of good scripts.

# Complain if a filehandle has any lower case
                    # letters.  This is suggested good practice.
                    # Use 'sub_name' because something like
                    # main::MYHANDLE is ok for filehandle
                    if ( $sub_name =~ /[a-z]/ ) {

# could be bug caused by older perltidy if
                        # followed by '('
                        if ( $input_line =~ m/\G\s*\(/gc ) {
                            complain(
"Caution: unknown word '$tok' in indirect object slot\n"
                            );
                        }
                    }
                }

# bareword not followed by a space -- may not be filehandle
                # (may be function call defined in a 'use' statement)
                else {
                    $type = 'Z';
                }
            }
        }

# Now we must convert back from character position
        # to pre_token index.
        # I don't think an error flag can occur here ..but who knows
        my $error;
        ( $i, $error ) =
          inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
        if ($error) {
            warning("scan_bare_identifier: Possibly invalid tokenization\n");
        }
    }

# no match but line not blank - could be syntax error
    # perl will take '::' alone without complaint
    else {
        $type = 'w';

# change this warning to log message if it becomes annoying
        warning("didn't find identifier after leading ::\n");
    }
    return ( $i, $tok, $type, $prototype );
}

sub scan_id_do {

# This is the new scanner and will eventually replace scan_identifier.
# Only type 'sub' and 'package' are implemented.
# Token types $ * % @ & -> are not yet implemented.
#
# Scan identifier following a type token.
# The type of call depends on $id_scan_state: $id_scan_state = ''
# for starting call, in which case $tok must be the token defining
# the type.
#
# If the type token is the last nonblank token on the line, a value
# of $id_scan_state = $tok is returned, indicating that further
# calls must be made to get the identifier.  If the type token is
# not the last nonblank token on the line, the identifier is
# scanned and handled and a value of '' is returned.
# USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
# $statement_type, $tokenizer_self

my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
        $max_token_index )
      = @_;
    use constant DEBUG_NSCAN => 0;
    my $type = '';
    my ( $i_beg, $pos_beg );

#print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
    #my ($a,$b,$c) = caller;
    #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";

# on re-entry, start scanning at first token on the line
    if ($id_scan_state) {
        $i_beg = $i;
        $type  = '';
    }

# on initial entry, start scanning just after type token
    else {
        $i_beg         = $i + 1;
        $id_scan_state = $tok;
        $type          = 't';
    }

# find $i_beg = index of next nonblank token,
    # and handle empty lines
    my $blank_line          = 0;
    my $next_nonblank_token = $rtokens->[$i_beg];
    if ( $i_beg > $max_token_index ) {
        $blank_line = 1;
    }
    else {

# only a '#' immediately after a '$' is not a comment
        if ( $next_nonblank_token eq '#' ) {
            unless ( $tok eq '$' ) {
                $blank_line = 1;
            }
        }

if ( $next_nonblank_token =~ /^\s/ ) {
            ( $next_nonblank_token, $i_beg ) =
              find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
                $max_token_index );
            if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
                $blank_line = 1;
            }
        }
    }

# handle non-blank line; identifier, if any, must follow
    unless ($blank_line) {

if ( $is_sub{$id_scan_state} ) {
            ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
                {
                    input_line      => $input_line,
                    i               => $i,
                    i_beg           => $i_beg,
                    tok             => $tok,
                    type            => $type,
                    rtokens         => $rtokens,
                    rtoken_map      => $rtoken_map,
                    id_scan_state   => $id_scan_state,
                    max_token_index => $max_token_index
                }
            );
        }

elsif ( $is_package{$id_scan_state} ) {
            ( $i, $tok, $type ) =
              do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
                $rtoken_map, $max_token_index );
            $id_scan_state = '';
        }

else {
            warning("invalid token in scan_id: $tok\n");
            $id_scan_state = '';
        }
    }

if ( $id_scan_state && ( !defined($type) || !$type ) ) {

# shouldn't happen:
        warning(
"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
        );
        report_definite_bug();
    }

DEBUG_NSCAN && do {
        print STDOUT
          "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
    };
    return ( $i, $tok, $type, $id_scan_state );
}

sub check_prototype {
    my ( $proto, $package, $subname ) = @_;
    return unless ( defined($package) && defined($subname) );
    if ( defined($proto) ) {
        $proto =~ s/^\s*$\s*//;
        $proto =~ s/\s*$$//;
        if ($proto) {
            $is_user_function{$package}{$subname}        = 1;
            $user_function_prototype{$package}{$subname} = "($proto)";

# prototypes containing '&' must be treated specially..
            if ( $proto =~ /\&/ ) {

# right curly braces of prototypes ending in
                # '&' may be followed by an operator
                if ( $proto =~ /\&$/ ) {
                    $is_block_function{$package}{$subname} = 1;
                }

# right curly braces of prototypes NOT ending in
                # '&' may NOT be followed by an operator
                elsif ( $proto !~ /\&$/ ) {
                    $is_block_list_function{$package}{$subname} = 1;
                }
            }
        }
        else {
            $is_constant{$package}{$subname} = 1;
        }
    }
    else {
        $is_user_function{$package}{$subname} = 1;
    }
    return;
}

sub do_scan_package {

# do_scan_package parses a package name
    # it is called with $i_beg equal to the index of the first nonblank
    # token following a 'package' token.
    # USES GLOBAL VARIABLES: $current_package,

# package NAMESPACE
    # package NAMESPACE VERSION
    # package NAMESPACE BLOCK
    # package NAMESPACE VERSION BLOCK
    #
    # If VERSION is provided, package sets the $VERSION variable in the given
    # namespace to a version object with the VERSION provided. VERSION must be
    # a "strict" style version number as defined by the version module: a
    # positive decimal number (integer or decimal-fraction) without
    # exponentiation or else a dotted-decimal v-string with a leading 'v'
    # character and at least three components.
    # reference http://perldoc.perl.org/functions/package.html

my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
        $max_token_index )
      = @_;
    my $package = undef;
    my $pos_beg = $rtoken_map->[$i_beg];
    pos($input_line) = $pos_beg;

# handle non-blank line; package name, if any, must follow
    if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
        $package = $1;
        $package = ( defined($1) && $1 ) ? $1 : 'main';
        $package =~ s/\'/::/g;
        if ( $package =~ /^\:/ ) { $package = 'main' . $package }
        $package =~ s/::$//;
        my $pos  = pos($input_line);
        my $numc = $pos - $pos_beg;
        $tok  = 'package ' . substr( $input_line, $pos_beg, $numc );
        $type = 'i';

# Now we must convert back from character position
        # to pre_token index.
        # I don't think an error flag can occur here ..but ?
        my $error;
        ( $i, $error ) =
          inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
        if ($error) { warning("Possibly invalid package\n") }
        $current_package = $package;

# we should now have package NAMESPACE
        # now expecting VERSION, BLOCK, or ; to follow ...
        # package NAMESPACE VERSION
        # package NAMESPACE BLOCK
        # package NAMESPACE VERSION BLOCK
        my ( $next_nonblank_token, $i_next ) =
          find_next_nonblank_token( $i, $rtokens, $max_token_index );

# check that something recognizable follows, but do not parse.
        # A VERSION number will be parsed later as a number or v-string in the
        # normal way.  What is important is to set the statement type if
        # everything looks okay so that the operator_expected() routine
        # knows that the number is in a package statement.
        # Examples of valid primitive tokens that might follow are:
        #  1235  . ; { } v3  v
        if ( $next_nonblank_token =~ /^([v\.\d;\{\}])|v\d|\d+$/ ) {
            $statement_type = $tok;
        }
        else {
            warning(
                "Unexpected '$next_nonblank_token' after package name '$tok'\n"
            );
        }
    }

# no match but line not blank --
    # could be a label with name package, like package:  , for example.
    else {
        $type = 'k';
    }

return ( $i, $tok, $type );
}

sub scan_identifier_do {

# This routine assembles tokens into identifiers.  It maintains a
    # scan state, id_scan_state.  It updates id_scan_state based upon
    # current id_scan_state and token, and returns an updated
    # id_scan_state and the next index after the identifier.

# USES GLOBAL VARIABLES: $context, $last_nonblank_token,
    # $last_nonblank_type

my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
        $expecting, $container_type )
      = @_;
    use constant DEBUG_SCAN_ID => 0;
    my $i_begin   = $i;
    my $type      = '';
    my $tok_begin = $rtokens->[$i_begin];
    if ( $tok_begin eq ':' ) { $tok_begin = '::' }
    my $id_scan_state_begin = $id_scan_state;
    my $identifier_begin    = $identifier;
    my $tok                 = $tok_begin;
    my $message             = "";
    my $tok_is_blank;    # a flag to speed things up

my $in_prototype_or_signature =
      $container_type && $container_type =~ /^sub\b/;

# these flags will be used to help figure out the type:
    my $saw_alpha;
    my $saw_type;

# allow old package separator (') except in 'use' statement
    my $allow_tick = ( $last_nonblank_token ne 'use' );

#########################################################
    # get started by defining a type and a state if necessary
    #########################################################

if ( !$id_scan_state ) {
        $context = UNKNOWN_CONTEXT;

# fixup for digraph
        if ( $tok eq '>' ) {
            $tok       = '->';
            $tok_begin = $tok;
        }
        $identifier = $tok;

if ( $tok eq '$' || $tok eq '*' ) {
            $id_scan_state = '$';
            $context       = SCALAR_CONTEXT;
        }
        elsif ( $tok eq '%' || $tok eq '@' ) {
            $id_scan_state = '$';
            $context       = LIST_CONTEXT;
        }
        elsif ( $tok eq '&' ) {
            $id_scan_state = '&';
        }
        elsif ( $tok eq 'sub' or $tok eq 'package' ) {
            $saw_alpha     = 0;     # 'sub' is considered type info here
            $id_scan_state = '$';
            $identifier .= ' ';     # need a space to separate sub from sub name
        }
        elsif ( $tok eq '::' ) {
            $id_scan_state = 'A';
        }
        elsif ( $tok =~ /^\w/ ) {
            $id_scan_state = ':';
            $saw_alpha     = 1;
        }
        elsif ( $tok eq '->' ) {
            $id_scan_state = '$';
        }
        else {

# shouldn't happen
            my ( $a, $b, $c ) = caller;
            warning("Program Bug: scan_identifier given bad token = $tok \n");
            warning("   called from sub $a  line: $c\n");
            report_definite_bug();
        }
        $saw_type = !$saw_alpha;
    }
    else {
        $i--;
        $saw_alpha = ( $tok =~ /^\w/ );
        $saw_type  = ( $tok =~ /([\$\%\@\*\&])/ );
    }

###############################
    # loop to gather the identifier
    ###############################

my $i_save = $i;

while ( $i < $max_token_index ) {
        my $last_tok_is_blank = $tok_is_blank;
        if   ($tok_is_blank) { $tok_is_blank = undef }
        else                 { $i_save       = $i }

$tok = $rtokens->[ ++$i ];

# patch to make digraph :: if necessary
        if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
            $tok = '::';
            $i++;
        }

########################
        # Starting variable name
        ########################

if ( $id_scan_state eq '$' ) {

if ( $tok eq '$' ) {

$identifier .= $tok;

# we've got a punctuation variable if end of line (punct.t)
                if ( $i == $max_token_index ) {
                    $type          = 'i';
                    $id_scan_state = '';
                    last;
                }
            }
            elsif ( $tok =~ /^\w/ ) {    # alphanumeric ..
                $saw_alpha     = 1;
                $id_scan_state = ':';    # now need ::
                $identifier .= $tok;
            }
            elsif ( $tok eq '::' ) {
                $id_scan_state = 'A';
                $identifier .= $tok;
            }

# POSTDEFREF ->@ ->% ->& ->*
            elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
                $identifier .= $tok;
            }
            elsif ( $tok eq "'" && $allow_tick ) {    # alphanumeric ..
                $saw_alpha     = 1;
                $id_scan_state = ':';                 # now need ::
                $identifier .= $tok;

# Perl will accept leading digits in identifiers,
                # although they may not always produce useful results.
                # Something like $main::0 is ok.  But this also works:
                #
                #  sub howdy::123::bubba{ print "bubba $54321!\n" }
                #  howdy::123::bubba();
                #
            }

# $# and POSTDEFREF ->$#
            elsif (
                   ( $tok eq '#' )
                && ( $identifier =~ /\$$/ )

# a # inside a prototype or signature can only start a comment
                && !$in_prototype_or_signature
              )
            {
                # A '#' starts a comment if it follows a space. For example,
                # the following is equivalent to $ans=40.
                #   my $ #
                #     ans = 40;
                if ($last_tok_is_blank) {
                    $type = 'i';
                    if ( $id_scan_state eq '$' ) { $type = 't' }
                    $i             = $i_save;
                    $id_scan_state = '';
                    last;
                }

# May be '$#' or '$#array'
                $identifier .= $tok;    # keep same state, a $ could follow
            }

elsif ( $tok eq '{' ) {

# check for something like ${#} or ${�}
                if (
                    (
                           $identifier eq '$'
                        || $identifier eq '@'
                        || $identifier eq '$#'
                    )
                    && $i + 2 <= $max_token_index
                    && $rtokens->[ $i + 2 ] eq '}'
                    && $rtokens->[ $i + 1 ] !~ /[\s\w]/
                  )
                {
                    my $next2 = $rtokens->[ $i + 2 ];
                    my $next1 = $rtokens->[ $i + 1 ];
                    $identifier .= $tok . $next1 . $next2;
                    $i += 2;
                    $id_scan_state = '';
                    last;
                }

# skip something like ${xxx} or ->{
                $id_scan_state = '';

# if this is the first token of a line, any tokens for this
                # identifier have already been accumulated
                if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
                $i = $i_save;
                last;
            }

# space ok after leading $ % * & @
            elsif ( $tok =~ /^\s*$/ ) {

$tok_is_blank = 1;

if ( $identifier =~ /^[\$\%\*\&\@]/ ) {

if ( length($identifier) > 1 ) {
                        $id_scan_state = '';
                        $i             = $i_save;
                        $type          = 'i';    # probably punctuation variable
                        last;
                    }
                    else {

# spaces after $'s are common, and space after @
                        # is harmless, so only complain about space
                        # after other type characters. Space after $ and
                        # @ will be removed in formatting.  Report space
                        # after % and * because they might indicate a
                        # parsing error.  In other words '% ' might be a
                        # modulo operator.  Delete this warning if it
                        # gets annoying.
                        if ( $identifier !~ /^[\@\$]$/ ) {
                            $message =
                              "Space in identifier, following $identifier\n";
                        }
                    }
                }

# else:
                # space after '->' is ok
            }
            elsif ( $tok eq '^' ) {

# check for some special variables like $^W
                if ( $identifier =~ /^[\$\*\@\%]$/ ) {
                    $identifier .= $tok;
                    $id_scan_state = 'A';

# Perl accepts '$^]' or '@^]', but
                    # there must not be a space before the ']'.
                    my $next1 = $rtokens->[ $i + 1 ];
                    if ( $next1 eq ']' ) {
                        $i++;
                        $identifier .= $next1;
                        $id_scan_state = "";
                        last;
                    }
                }
                else {
                    $id_scan_state = '';
                }
            }
            else {    # something else

if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {

# We might be in an extrusion of
                    #     sub foo2 ( $first, $, $third ) {
                    # looking at a line starting with a comma, like
                    #   $
                    #   ,
                    # in this case the comma ends the signature variable
                    # '$' which will have been previously marked type 't'
                    # rather than 'i'.
                    if ( $i == $i_begin ) {
                        $identifier = "";
                        $type       = "";
                    }

# at a # we have to mark as type 't' because more may
                    # follow, otherwise, in a signature we can let '$' be an
                    # identifier here for better formatting.
                    # See 'mangle4.in' for a test case.
                    else {
                        $type = 'i';
                        if ( $id_scan_state eq '$' && $tok eq '#' ) {
                            $type = 't';
                        }
                        $i = $i_save;
                    }
                    $id_scan_state = '';
                    last;
                }

# check for various punctuation variables
                if ( $identifier =~ /^[\$\*\@\%]$/ ) {
                    $identifier .= $tok;
                }

# POSTDEFREF: Postfix reference ->$* ->%*  ->@* ->** ->&* ->$#*
                elsif ($tok eq '*'
                    && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
                {
                    $identifier .= $tok;
                }

elsif ( $identifier eq '$#' ) {

if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }

# perl seems to allow just these: $#: $#- $#+
                    elsif ( $tok =~ /^[\:\-\+]$/ ) {
                        $type = 'i';
                        $identifier .= $tok;
                    }
                    else {
                        $i = $i_save;
                        write_logfile_entry( 'Use of $# is deprecated' . "\n" );
                    }
                }
                elsif ( $identifier eq '$$' ) {

# perl does not allow references to punctuation
                    # variables without braces.  For example, this
                    # won't work:
                    #  $:=\4;
                    #  $a = $$:;
                    # You would have to use
                    #  $a = ${$:};

# '$$' alone is punctuation variable for PID
                    $i = $i_save;
                    if   ( $tok eq '{' ) { $type = 't' }
                    else                 { $type = 'i' }
                }
                elsif ( $identifier eq '->' ) {
                    $i = $i_save;
                }
                else {
                    $i = $i_save;
                    if ( length($identifier) == 1 ) { $identifier = ''; }
                }
                $id_scan_state = '';
                last;
            }
        }

###################################
        # looking for alphanumeric after ::
        ###################################

elsif ( $id_scan_state eq 'A' ) {

$tok_is_blank = $tok =~ /^\s*$/;

if ( $tok =~ /^\w/ ) {    # found it
                $identifier .= $tok;
                $id_scan_state = ':';    # now need ::
                $saw_alpha     = 1;
            }
            elsif ( $tok eq "'" && $allow_tick ) {
                $identifier .= $tok;
                $id_scan_state = ':';    # now need ::
                $saw_alpha     = 1;
            }
            elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
                $id_scan_state = '(';
                $identifier .= $tok;
            }
            elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
                $id_scan_state = ')';
                $identifier .= $tok;
            }
            else {
                $id_scan_state = '';
                $i             = $i_save;
                last;
            }
        }

###################################
        # looking for :: after alphanumeric
        ###################################

elsif ( $id_scan_state eq ':' ) {    # looking for :: after alpha

$tok_is_blank = $tok =~ /^\s*$/;

if ( $tok eq '::' ) {            # got it
                $identifier .= $tok;
                $id_scan_state = 'A';        # now require alpha
            }
            elsif ( $tok =~ /^\w/ ) {        # more alphanumeric is ok here
                $identifier .= $tok;
                $id_scan_state = ':';        # now need ::
                $saw_alpha     = 1;
            }
            elsif ( $tok eq "'" && $allow_tick ) {    # tick

if ( $is_keyword{$identifier} ) {
                    $id_scan_state = '';              # that's all
                    $i             = $i_save;
                }
                else {
                    $identifier .= $tok;
                }
            }
            elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
                $id_scan_state = '(';
                $identifier .= $tok;
            }
            elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
                $id_scan_state = ')';
                $identifier .= $tok;
            }
            else {
                $id_scan_state = '';        # that's all
                $i             = $i_save;
                last;
            }
        }

##############################
        # looking for '(' of prototype
        ##############################

elsif ( $id_scan_state eq '(' ) {

if ( $tok eq '(' ) {    # got it
                $identifier .= $tok;
                $id_scan_state = ')';    # now find the end of it
            }
            elsif ( $tok =~ /^\s*$/ ) {    # blank - keep going
                $identifier .= $tok;
                $tok_is_blank = 1;
            }
            else {
                $id_scan_state = '';        # that's all - no prototype
                $i             = $i_save;
                last;
            }
        }

##############################
        # looking for ')' of prototype
        ##############################

elsif ( $id_scan_state eq ')' ) {

$tok_is_blank = $tok =~ /^\s*$/;

if ( $tok eq ')' ) {    # got it
                $identifier .= $tok;
                $id_scan_state = '';    # all done
                last;
            }
            elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
                $identifier .= $tok;
            }
            else {    # probable error in script, but keep going
                warning("Unexpected '$tok' while seeking end of prototype\n");
                $identifier .= $tok;
            }
        }

###################
        # Starting sub call
        ###################

elsif ( $id_scan_state eq '&' ) {

if ( $tok =~ /^[\$\w]/ ) {    # alphanumeric ..
                $id_scan_state = ':';     # now need ::
                $saw_alpha     = 1;
                $identifier .= $tok;
            }
            elsif ( $tok eq "'" && $allow_tick ) {    # alphanumeric ..
                $id_scan_state = ':';                 # now need ::
                $saw_alpha     = 1;
                $identifier .= $tok;
            }
            elsif ( $tok =~ /^\s*$/ ) {               # allow space
                $tok_is_blank = 1;
            }
            elsif ( $tok eq '::' ) {                  # leading ::
                $id_scan_state = 'A';                 # accept alpha next
                $identifier .= $tok;
            }
            elsif ( $tok eq '{' ) {
                if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
                $i             = $i_save;
                $id_scan_state = '';
                last;
            }
            else {

# punctuation variable?
                # testfile: cunningham4.pl
                #
                # We have to be careful here.  If we are in an unknown state,
                # we will reject the punctuation variable.  In the following
                # example the '&' is a binary operator but we are in an unknown
                # state because there is no sigil on 'Prima', so we don't
                # know what it is.  But it is a bad guess that
                # '&~' is a function variable.
                # $self->{text}->{colorMap}->[
                #   Prima::PodView::COLOR_CODE_FOREGROUND
                #   & ~tb::COLOR_INDEX ] =
                #   $sec->{ColorCode}
                if ( $identifier eq '&' && $expecting ) {
                    $identifier .= $tok;
                }
                else {
                    $identifier = '';
                    $i          = $i_save;
                    $type       = '&';
                }
                $id_scan_state = '';
                last;
            }
        }

######################
        # unknown state - quit
        ######################

else {    # can get here due to error in initialization
            $id_scan_state = '';
            $i             = $i_save;
            last;
        }
    } ## end of main loop

if ( $id_scan_state eq ')' ) {
        warning("Hit end of line while seeking ) to end prototype\n");
    }

# once we enter the actual identifier, it may not extend beyond
    # the end of the current line
    if ( $id_scan_state =~ /^[A\:]/ ) {
        $id_scan_state = '';
    }

# Patch: the deprecated variable $# does not combine with anything on the
    # next line.
    if ( $identifier eq '$#' ) { $id_scan_state = '' }

if ( $i < 0 ) { $i = 0 }

# Be sure a token type is defined
    if ( !$type ) {

if ($saw_type) {

if ($saw_alpha) {
                if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
                    $type = 'w';
                }
                else { $type = 'i' }
            }
            elsif ( $identifier eq '->' ) {
                $type = '->';
            }
            elsif (
                ( length($identifier) > 1 )

# In something like '@$=' we have an identifier '@$'
                # In something like '$${' we have type '$$' (and only
                # part of an identifier)
                && !( $identifier =~ /\$$/ && $tok eq '{' )
                && ( $identifier !~ /^(sub |package )$/ )
              )
            {
                $type = 'i';
            }
            else { $type = 't' }
        }
        elsif ($saw_alpha) {

# type 'w' includes anything without leading type info
            # ($,%,@,*) including something like abc::def::ghi
            $type = 'w';
        }
        else {
            $type = '';
        }    # this can happen on a restart
    }

# See if we formed an identifier...
    if ($identifier) {
        $tok = $identifier;
        if ($message) { write_logfile_entry($message) }
    }

# did not find an identifier, back  up
    else {
        $tok = $tok_begin;
        $i   = $i_begin;
    }

DEBUG_SCAN_ID && do {
        my ( $a, $b, $c ) = caller;
        print STDOUT
"SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
        print STDOUT
"SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
    };
    return ( $i, $tok, $type, $id_scan_state, $identifier );
}

{    ## closure for sub do_scan_sub

# saved package and subnames in case prototype is on separate line
    my ( $package_saved, $subname_saved );

# initialize subname each time a new 'sub' keyword is encountered
    sub initialize_subname {
        $package_saved = "";
        $subname_saved = "";
        return;
    }

use constant {
        SUB_CALL       => 1,
        PAREN_CALL     => 2,
        PROTOTYPE_CALL => 3,
    };

sub do_scan_sub {

# do_scan_sub parses a sub name and prototype.

# At present there are three basic CALL TYPES which are
        # distinguished by the starting value of '$tok':
        # 1. $tok='sub', id_scan_state='sub'
        #    it is called with $i_beg equal to the index of the first nonblank
        #    token following a 'sub' token.
        # 2. $tok='(', id_scan_state='sub',
        #    it is called with $i_beg equal to the index of a '(' which may
        #    start a prototype.
        # 3. $tok='prototype', id_scan_state='prototype'
        #    it is called with $i_beg equal to the index of a '(' which is
        #    preceded by ': prototype' and has $id_scan_state eq 'prototype'

# Examples:

# A single type 1 call will get both the sub and prototype
        #   sub foo1 ( $$ ) { }
        #   ^

# The subname will be obtained with a 'sub' call
        # The prototype on line 2 will be obtained with a '(' call
        #   sub foo1
        #   ^                    <---call type 1
        #     ( $$ ) { }
        #     ^                  <---call type 2

# The subname will be obtained with a 'sub' call
        # The prototype will be obtained with a 'prototype' call
        #   sub foo1 ( $x, $y ) : prototype ( $$ ) { }
        #   ^ <---type 1                    ^ <---type 3

# TODO: add future error checks to be sure we have a valid
        # sub name.  For example, 'sub &doit' is wrong.  Also, be sure
        # a name is given if and only if a non-anonymous sub is
        # appropriate.
        # USES GLOBAL VARS: $current_package, $last_nonblank_token,
        # $in_attribute_list, %saw_function_definition,
        # $statement_type

my ($rinput_hash) = @_;

my $input_line      = $rinput_hash->{input_line};
        my $i               = $rinput_hash->{i};
        my $i_beg           = $rinput_hash->{i_beg};
        my $tok             = $rinput_hash->{tok};
        my $type            = $rinput_hash->{type};
        my $rtokens         = $rinput_hash->{rtokens};
        my $rtoken_map      = $rinput_hash->{rtoken_map};
        my $id_scan_state   = $rinput_hash->{id_scan_state};
        my $max_token_index = $rinput_hash->{max_token_index};

# Determine the CALL TYPE
        # 1=sub
        # 2=(
        # 3=prototype
        my $call_type =
            $tok eq 'prototype' ? PROTOTYPE_CALL
          : $tok eq '('         ? PAREN_CALL
          :                       SUB_CALL;

$id_scan_state = "";    # normally we get everything in one call
        my $subname = $subname_saved;
        my $package = $package_saved;
        my $proto   = undef;
        my $attrs   = undef;
        my $match;

my $pos_beg = $rtoken_map->[$i_beg];
        pos($input_line) = $pos_beg;

# Look for the sub NAME if this is a SUB call
        if (
               $call_type == SUB_CALL
            && $input_line =~ m/\G\s*
        ((?:\w*(?:'|::))*)  # package - something that ends in :: or '
        (\w+)               # NAME    - required
        /gcx
          )
        {
            $match   = 1;
            $subname = $2;

$package = ( defined($1) && $1 ) ? $1 : $current_package;
            $package =~ s/\'/::/g;
            if ( $package =~ /^\:/ ) { $package = 'main' . $package }
            $package =~ s/::$//;
            my $pos  = pos($input_line);
            my $numc = $pos - $pos_beg;
            $tok  = 'sub ' . substr( $input_line, $pos_beg, $numc );
            $type = 'i';

# remember the sub name in case another call is needed to
            # get the prototype
            $package_saved = $package;
            $subname_saved = $subname;
        }

# Now look for PROTO ATTRS for all call types
        # Look for prototype/attributes which are usually on the same
        # line as the sub name but which might be on a separate line.
        # For example, we might have an anonymous sub with attributes,
        # or a prototype on a separate line from its sub name

# NOTE: We only want to parse PROTOTYPES here. If we see anything that
        # does not look like a prototype, we assume it is a SIGNATURE and we
        # will stop and let the the standard tokenizer handle it.  In
        # particular, we stop if we see any nested parens, braces, or commas.
        # Also note, a valid prototype cannot contain any alphabetic character
        #  -- see https://perldoc.perl.org/perlsub
        # But it appears that an underscore is valid in a prototype, so the
        # regex below uses [A-Za-z] rather than \w
        # This is the old regex which has been replaced:
        # $input_line =~ m/\G(\s*$[^$$\}\{\,#]*$)?  # PROTO
        my $saw_opening_paren = $input_line =~ /\G\s*$/;
        if (
            $input_line =~ m/\G(\s*\([^$$\}\{\,#A-Za-z]*$)?  # PROTO
            (\s*:)?                              # ATTRS leading ':'
            /gcx
            && ( $1 || $2 )
          )
        {
            $proto = $1;
            $attrs = $2;

# Append the prototype to the starting token if it is 'sub' or
       # 'prototype'.  This is not necessary but for compatibility with previous
       # versions when the -csc flag is used:
            if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
                $tok .= $proto;
            }

# If we just entered the sub at an opening paren on this call, not
            # a following :prototype, label it with the previous token.  This is
            # necessary to propagate the sub name to its opening block.
            elsif ( $call_type == PAREN_CALL ) {
                $tok = $last_nonblank_token;
            }

$match ||= 1;
            $type = 'i';
        }

if ($match) {

# ATTRS: if there are attributes, back up and let the ':' be
            # found later by the scanner.
            my $pos = pos($input_line);
            if ($attrs) {
                $pos -= length($attrs);
            }

my $next_nonblank_token = $tok;

# catch case of line with leading ATTR ':' after anonymous sub
            if ( $pos == $pos_beg && $tok eq ':' ) {
                $type              = 'A';
                $in_attribute_list = 1;
            }

# Otherwise, if we found a match we must convert back from
            # string position to the pre_token index for continued parsing.
            else {

# I don't think an error flag can occur here ..but ?
                my $error;
                ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
                    $max_token_index );
                if ($error) { warning("Possibly invalid sub\n") }

# check for multiple definitions of a sub
                ( $next_nonblank_token, my $i_next ) =
                  find_next_nonblank_token_on_this_line( $i, $rtokens,
                    $max_token_index );
            }

if ( $next_nonblank_token =~ /^(\s*|#)$/ )
            {    # skip blank or side comment
                my ( $rpre_tokens, $rpre_types ) =
                  peek_ahead_for_n_nonblank_pre_tokens(1);
                if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
                    $next_nonblank_token = $rpre_tokens->[0];
                }
                else {
                    $next_nonblank_token = '}';
                }
            }

# See what's next...
            if ( $next_nonblank_token eq '{' ) {
                if ($subname) {

# Check for multiple definitions of a sub, but
                    # it is ok to have multiple sub BEGIN, etc,
                    # so we do not complain if name is all caps
                    if (   $saw_function_definition{$package}{$subname}
                        && $subname !~ /^[A-Z]+$/ )
                    {
                        my $lno = $saw_function_definition{$package}{$subname};
                        warning(
"already saw definition of 'sub $subname' in package '$package' at line $lno\n"
                        );
                    }
                    $saw_function_definition{$package}{$subname} =
                      $tokenizer_self->[_last_line_number_];
                }
            }
            elsif ( $next_nonblank_token eq ';' ) {
            }
            elsif ( $next_nonblank_token eq '}' ) {
            }

# ATTRS - if an attribute list follows, remember the name
            # of the sub so the next opening brace can be labeled.
            # Setting 'statement_type' causes any ':'s to introduce
            # attributes.
            elsif ( $next_nonblank_token eq ':' ) {
                $statement_type = $tok if ( $call_type == SUB_CALL );
            }

# if we stopped before an open paren ...
            elsif ( $next_nonblank_token eq '(' ) {

# If we DID NOT see this paren above then it must be on the
                # next line so we will set a flag to come back here and see if
                # it is a PROTOTYPE

# Otherwise, we assume it is a SIGNATURE rather than a
                # PROTOTYPE and let the normal tokenizer handle it as a list
                if ( !$saw_opening_paren ) {
                    $id_scan_state = 'sub';    # we must come back to get proto
                }
                $statement_type = $tok if ( $call_type == SUB_CALL );
            }
            elsif ($next_nonblank_token) {     # EOF technically ok
                $subname = "" unless defined($subname);
                warning(
"expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
                );
            }
            check_prototype( $proto, $package, $subname );
        }

# no match to either sub name or prototype, but line not blank
        else {

}
        return ( $i, $tok, $type, $id_scan_state );
    }
}

#########i###############################################################
# Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
#########################################################################

sub find_next_nonblank_token {
    my ( $i, $rtokens, $max_token_index ) = @_;

if ( $i >= $max_token_index ) {
        if ( !peeked_ahead() ) {
            peeked_ahead(1);
            $rtokens =
              peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
        }
    }
    my $next_nonblank_token = $rtokens->[ ++$i ];

if ( $next_nonblank_token =~ /^\s*$/ ) {
        $next_nonblank_token = $rtokens->[ ++$i ];
    }
    return ( $next_nonblank_token, $i );
}

sub numerator_expected {

# this is a filter for a possible numerator, in support of guessing
    # for the / pattern delimiter token.
    # returns -
    #   1 - yes
    #   0 - can't tell
    #  -1 - no
    # Note: I am using the convention that variables ending in
    # _expected have these 3 possible values.
    my ( $i, $rtokens, $max_token_index ) = @_;
    my $numerator_expected = 0;

my $next_token = $rtokens->[ $i + 1 ];
    if ( $next_token eq '=' ) { $i++; }    # handle /=
    my ( $next_nonblank_token, $i_next ) =
      find_next_nonblank_token( $i, $rtokens, $max_token_index );

if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
        $numerator_expected = 1;
    }
    else {

if ( $next_nonblank_token =~ /^\s*$/ ) {
            $numerator_expected = 0;
        }
        else {
            $numerator_expected = -1;
        }
    }
    return $numerator_expected;
}

{    ## closure for sub pattern_expected
    my %pattern_test;

BEGIN {

# List of tokens which may follow a pattern.  Note that we will not
        # have formed digraphs at this point, so we will see '&' instead of
        # '&&' and '|' instead of '||'

# /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
        my @q = qw( & && | || ? : + - * and or while if unless);
        push @q, ')', '}', ']', '>', ',', ';';
        @{pattern_test}{@q} = (1) x scalar(@q);
    }

sub pattern_expected {

# This a filter for a possible pattern.
        # It looks at the token after a possible pattern and tries to
        # determine if that token could end a pattern.
        # returns -
        #   1 - yes
        #   0 - can't tell
        #  -1 - no
        my ( $i, $rtokens, $max_token_index ) = @_;
        my $is_pattern = 0;

my $next_token = $rtokens->[ $i + 1 ];
        if ( $next_token =~ /^[msixpodualgc]/ ) {
            $i++;
        }    # skip possible modifier
        my ( $next_nonblank_token, $i_next ) =
          find_next_nonblank_token( $i, $rtokens, $max_token_index );

if ( $pattern_test{$next_nonblank_token} ) {
            $is_pattern = 1;
        }
        else {

if ( $next_nonblank_token =~ /^\s*$/ ) {
                $is_pattern = 0;
            }
            else {
                $is_pattern = -1;
            }
        }
        return $is_pattern;
    }
}

sub find_next_nonblank_token_on_this_line {
    my ( $i, $rtokens, $max_token_index ) = @_;
    my $next_nonblank_token;

if ( $i < $max_token_index ) {
        $next_nonblank_token = $rtokens->[ ++$i ];

if ( $next_nonblank_token =~ /^\s*$/ ) {

if ( $i < $max_token_index ) {
                $next_nonblank_token = $rtokens->[ ++$i ];
            }
        }
    }
    else {
        $next_nonblank_token = "";
    }
    return ( $next_nonblank_token, $i );
}

sub find_angle_operator_termination {

# We are looking at a '<' and want to know if it is an angle operator.
    # We are to return:
    #   $i = pretoken index of ending '>' if found, current $i otherwise
    #   $type = 'Q' if found, '>' otherwise
    my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
    my $i    = $i_beg;
    my $type = '<';
    pos($input_line) = 1 + $rtoken_map->[$i];

my $filter;

# we just have to find the next '>' if a term is expected
    if ( $expecting == TERM ) { $filter = '[\>]' }

# we have to guess if we don't know what is expected
    elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }

# shouldn't happen - we shouldn't be here if operator is expected
    else { warning("Program Bug in find_angle_operator_termination\n") }

# To illustrate what we might be looking at, in case we are
    # guessing, here are some examples of valid angle operators
    # (or file globs):
    #  <tmp_imp/*>
    #  <FH>
    #  <$fh>
    #  <*.c *.h>
    #  <_>
    #  <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
    #  <${PREFIX}*img*.$IMAGE_TYPE>
    #  <img*.$IMAGE_TYPE>
    #  <Timg*.$IMAGE_TYPE>
    #  <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
    #
    # Here are some examples of lines which do not have angle operators:
    #  return unless $self->[2]++ < $#{$self->[1]};
    #  < 2  || @$t >
    #
    # the following line from dlister.pl caused trouble:
    #  print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
    #
    # If the '<' starts an angle operator, it must end on this line and
    # it must not have certain characters like ';' and '=' in it.  I use
    # this to limit the testing.  This filter should be improved if
    # possible.

if ( $input_line =~ /($filter)/g ) {

if ( $1 eq '>' ) {

# We MAY have found an angle operator termination if we get
            # here, but we need to do more to be sure we haven't been
            # fooled.
            my $pos = pos($input_line);

my $pos_beg = $rtoken_map->[$i];
            my $str     = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );

# Reject if the closing '>' follows a '-' as in:
            # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
            if ( $expecting eq UNKNOWN ) {
                my $check = substr( $input_line, $pos - 2, 1 );
                if ( $check eq '-' ) {
                    return ( $i, $type );
                }
            }

######################################debug#####
            #write_diagnostics( "ANGLE? :$str\n");
            #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
            ######################################debug#####
            $type = 'Q';
            my $error;
            ( $i, $error ) =
              inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );

# It may be possible that a quote ends midway in a pretoken.
            # If this happens, it may be necessary to split the pretoken.
            if ($error) {
                warning(
                    "Possible tokinization error..please check this line\n");
                report_possible_bug();
            }

# count blanks on inside of brackets
            my $blank_count = 0;
            $blank_count++ if ( $str =~ /<\s+/ );
            $blank_count++ if ( $str =~ /\s+>/ );

# Now let's see where we stand....
            # OK if math op not possible
            if ( $expecting == TERM ) {
            }

# OK if there are no more than 2 non-blank pre-tokens inside
            # (not possible to write 2 token math between < and >)
            # This catches most common cases
            elsif ( $i <= $i_beg + 3 + $blank_count ) {

# No longer any need to document this common case
                ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
            }

# OK if there is some kind of identifier inside
            #   print $fh <tvg::INPUT>;
            elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
                write_diagnostics("ANGLE (contains identifier): $str\n");
            }

# Not sure..
            else {

# Let's try a Brace Test: any braces inside must balance
                my $br = 0;
                while ( $str =~ /\{/g ) { $br++ }
                while ( $str =~ /\}/g ) { $br-- }
                my $sb = 0;
                while ( $str =~ /\[/g ) { $sb++ }
                while ( $str =~ /\]/g ) { $sb-- }
                my $pr = 0;
                while ( $str =~ /$/g ) { $pr++ }
                while ( $str =~ /$/g ) { $pr-- }

# if braces do not balance - not angle operator
                if ( $br || $sb || $pr ) {
                    $i    = $i_beg;
                    $type = '<';
                    write_diagnostics(
                        "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
                }

# we should keep doing more checks here...to be continued
                # Tentatively accepting this as a valid angle operator.
                # There are lots more things that can be checked.
                else {
                    write_diagnostics(
                        "ANGLE-Guessing yes: $str expecting=$expecting\n");
                    write_logfile_entry("Guessing angle operator here: $str\n");
                }
            }
        }

# didn't find ending >
        else {
            if ( $expecting == TERM ) {
                warning("No ending > for angle operator\n");
            }
        }
    }
    return ( $i, $type );
}

sub scan_number_do {

#  scan a number in any of the formats that Perl accepts
    #  Underbars (_) are allowed in decimal numbers.
    #  input parameters -
    #      $input_line  - the string to scan
    #      $i           - pre_token index to start scanning
    #    $rtoken_map    - reference to the pre_token map giving starting
    #                    character position in $input_line of token $i
    #  output parameters -
    #    $i            - last pre_token index of the number just scanned
    #    number        - the number (characters); or undef if not a number

my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
    my $pos_beg = $rtoken_map->[$i];
    my $pos;
    my $i_begin = $i;
    my $number  = undef;
    my $type    = $input_type;

my $first_char = substr( $input_line, $pos_beg, 1 );

# Look for bad starting characters; Shouldn't happen..
    if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
        warning("Program bug - scan_number given character $first_char\n");
        report_definite_bug();
        return ( $i, $type, $number );
    }

# handle v-string without leading 'v' character ('Two Dot' rule)
    # (vstring.t)
    # TODO: v-strings may contain underscores
    pos($input_line) = $pos_beg;
    if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
        $pos = pos($input_line);
        my $numc = $pos - $pos_beg;
        $number = substr( $input_line, $pos_beg, $numc );
        $type   = 'v';
        report_v_string($number);
    }

# handle octal, hex, binary
    if ( !defined($number) ) {
        pos($input_line) = $pos_beg;

# Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
        # For reference, the format prior to hex floating point is:
        #   /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
        #             (hex)               (octal)   (binary)
        if (
            $input_line =~

/\G[+-]?0(                   # leading [signed] 0

# a hex float, i.e. '0x0.b17217f7d1cf78p0'
           ([xX][0-9a-fA-F_]*            # X and optional leading digits
           (\.([0-9a-fA-F][0-9a-fA-F_]*)?)?   # optional decimal and fraction
           [Pp][+-]?[0-9a-fA-F]          # REQUIRED exponent with digit
           [0-9a-fA-F_]*)                # optional Additional exponent digits

# or hex integer
           |([xX][0-9a-fA-F_]+)

# or octal fraction
           |([0-7_]+               # string of octal digits 
           (\.([0-7][0-7_]*)?)?    # optional decimal and fraction
           [Pp][+-]?[0-7]          # REQUIRED exponent, no underscore
           [0-7_]*)                # Additonal exponent digits, with underscores

# or octal integer
           |([0-7_]+)               # string of octal digits

# or a binary float
           |([bB][01_]*            # 'b' with string of binary digits 
           (\.([01][01_]*)?)?      # optional decimal and fraction
           [Pp][+-]?[01]           # Required exponent indicator, no underscore
           [01_]*)                 # additional exponent bits

# or binary integer
           |([bB][01_]+)           # 'b' with string of binary digits

)/gx
          )
        {
            $pos = pos($input_line);
            my $numc = $pos - $pos_beg;
            $number = substr( $input_line, $pos_beg, $numc );
            $type   = 'n';
        }
    }

# handle decimal
    if ( !defined($number) ) {
        pos($input_line) = $pos_beg;

if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
            $pos = pos($input_line);

# watch out for things like 0..40 which would give 0. by this;
            if (   ( substr( $input_line, $pos - 1, 1 ) eq '.' )
                && ( substr( $input_line, $pos, 1 ) eq '.' ) )
            {
                $pos--;
            }
            my $numc = $pos - $pos_beg;
            $number = substr( $input_line, $pos_beg, $numc );
            $type   = 'n';
        }
    }

# filter out non-numbers like e + - . e2  .e3 +e6
    # the rule: at least one digit, and any 'e' must be preceded by a digit
    if (
        $number !~ /\d/    # no digits
        || (   $number =~ /^(.*)[eE]/
            && $1 !~ /\d/ )    # or no digits before the 'e'
      )
    {
        $number = undef;
        $type   = $input_type;
        return ( $i, $type, $number );
    }

# Found a number; now we must convert back from character position
    # to pre_token index. An error here implies user syntax error.
    # An example would be an invalid octal number like '009'.
    my $error;
    ( $i, $error ) =
      inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
    if ($error) { warning("Possibly invalid number\n") }

return ( $i, $type, $number );
}

sub inverse_pretoken_map {

# Starting with the current pre_token index $i, scan forward until
    # finding the index of the next pre_token whose position is $pos.
    my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
    my $error = 0;

while ( ++$i <= $max_token_index ) {

if ( $pos <= $rtoken_map->[$i] ) {

# Let the calling routine handle errors in which we do not
            # land on a pre-token boundary.  It can happen by running
            # perltidy on some non-perl scripts, for example.
            if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
            $i--;
            last;
        }
    }
    return ( $i, $error );
}

sub find_here_doc {

# find the target of a here document, if any
    # input parameters:
    #   $i - token index of the second < of <<
    #   ($i must be less than the last token index if this is called)
    # output parameters:
    #   $found_target = 0 didn't find target; =1 found target
    #   HERE_TARGET - the target string (may be empty string)
    #   $i - unchanged if not here doc,
    #    or index of the last token of the here target
    #   $saw_error - flag noting unbalanced quote on here target
    my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
    my $ibeg                 = $i;
    my $found_target         = 0;
    my $here_doc_target      = '';
    my $here_quote_character = '';
    my $saw_error            = 0;
    my ( $next_nonblank_token, $i_next_nonblank, $next_token );
    $next_token = $rtokens->[ $i + 1 ];

# perl allows a backslash before the target string (heredoc.t)
    my $backslash = 0;
    if ( $next_token eq '\\' ) {
        $backslash  = 1;
        $next_token = $rtokens->[ $i + 2 ];
    }

( $next_nonblank_token, $i_next_nonblank ) =
      find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );

if ( $next_nonblank_token =~ /[\'\"\`]/ ) {

my $in_quote    = 1;
        my $quote_depth = 0;
        my $quote_pos   = 0;
        my $quoted_string;

(
            $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
            $quoted_string
          )
          = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
            $here_quote_character, $quote_pos, $quote_depth, $max_token_index );

if ($in_quote) {    # didn't find end of quote, so no target found
            $i = $ibeg;
            if ( $expecting == TERM ) {
                warning(
"Did not find here-doc string terminator ($here_quote_character) before end of line \n"
                );
                $saw_error = 1;
            }
        }
        else {              # found ending quote
            $found_target = 1;

my $tokj;
            foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
                $tokj = $rtokens->[$j];

# we have to remove any backslash before the quote character
                # so that the here-doc-target exactly matches this string
                next
                  if ( $tokj eq "\\"
                    && $j < $i - 1
                    && $rtokens->[ $j + 1 ] eq $here_quote_character );
                $here_doc_target .= $tokj;
            }
        }
    }

elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
        $found_target = 1;
        write_logfile_entry(
            "found blank here-target after <<; suggest using \"\"\n");
        $i = $ibeg;
    }
    elsif ( $next_token =~ /^\w/ ) {    # simple bareword or integer after <<

my $here_doc_expected;
        if ( $expecting == UNKNOWN ) {
            $here_doc_expected = guess_if_here_doc($next_token);
        }
        else {
            $here_doc_expected = 1;
        }

if ($here_doc_expected) {
            $found_target    = 1;
            $here_doc_target = $next_token;
            $i               = $ibeg + 1;
        }

}
    else {

if ( $expecting == TERM ) {
            $found_target = 1;
            write_logfile_entry("Note: bare here-doc operator <<\n");
        }
        else {
            $i = $ibeg;
        }
    }

# patch to neglect any prepended backslash
    if ( $found_target && $backslash ) { $i++ }

return ( $found_target, $here_doc_target, $here_quote_character, $i,
        $saw_error );
}

sub do_quote {

# follow (or continue following) quoted string(s)
    # $in_quote return code:
    #   0 - ok, found end
    #   1 - still must find end of quote whose target is $quote_character
    #   2 - still looking for end of first of two quotes
    #
    # Returns updated strings:
    #  $quoted_string_1 = quoted string seen while in_quote=1
    #  $quoted_string_2 = quoted string seen while in_quote=2
    my (
        $i,               $in_quote,    $quote_character,
        $quote_pos,       $quote_depth, $quoted_string_1,
        $quoted_string_2, $rtokens,     $rtoken_map,
        $max_token_index
    ) = @_;

my $in_quote_starting = $in_quote;

my $quoted_string;
    if ( $in_quote == 2 ) {    # two quotes/quoted_string_1s to follow
        my $ibeg = $i;
        (
            $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
            $quoted_string
          )
          = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
            $quote_pos, $quote_depth, $max_token_index );
        $quoted_string_2 .= $quoted_string;
        if ( $in_quote == 1 ) {
            if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
            $quote_character = '';
        }
        else {
            $quoted_string_2 .= "\n";
        }
    }

if ( $in_quote == 1 ) {    # one (more) quote to follow
        my $ibeg = $i;
        (
            $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
            $quoted_string
          )
          = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
            $quote_pos, $quote_depth, $max_token_index );
        $quoted_string_1 .= $quoted_string;
        if ( $in_quote == 1 ) {
            $quoted_string_1 .= "\n";
        }
    }
    return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
        $quoted_string_1, $quoted_string_2 );
}

sub follow_quoted_string {

# scan for a specific token, skipping escaped characters
    # if the quote character is blank, use the first non-blank character
    # input parameters:
    #   $rtokens = reference to the array of tokens
    #   $i = the token index of the first character to search
    #   $in_quote = number of quoted strings being followed
    #   $beginning_tok = the starting quote character
    #   $quote_pos = index to check next for alphanumeric delimiter
    # output parameters:
    #   $i = the token index of the ending quote character
    #   $in_quote = decremented if found end, unchanged if not
    #   $beginning_tok = the starting quote character
    #   $quote_pos = index to check next for alphanumeric delimiter
    #   $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
    #   $quoted_string = the text of the quote (without quotation tokens)
    my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
        $max_token_index )
      = @_;
    my ( $tok, $end_tok );
    my $i             = $i_beg - 1;
    my $quoted_string = "";

0 && do {
        print STDOUT
"QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
    };

# get the corresponding end token
    if ( $beginning_tok !~ /^\s*$/ ) {
        $end_tok = matching_end_token($beginning_tok);
    }

# a blank token means we must find and use the first non-blank one
    else {
        my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>

while ( $i < $max_token_index ) {
            $tok = $rtokens->[ ++$i ];

if ( $tok !~ /^\s*$/ ) {

if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
                    $i = $max_token_index;
                }
                else {

if ( length($tok) > 1 ) {
                        if ( $quote_pos <= 0 ) { $quote_pos = 1 }
                        $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
                    }
                    else {
                        $beginning_tok = $tok;
                        $quote_pos     = 0;
                    }
                    $end_tok     = matching_end_token($beginning_tok);
                    $quote_depth = 1;
                    last;
                }
            }
            else {
                $allow_quote_comments = 1;
            }
        }
    }

# There are two different loops which search for the ending quote
    # character.  In the rare case of an alphanumeric quote delimiter, we
    # have to look through alphanumeric tokens character-by-character, since
    # the pre-tokenization process combines multiple alphanumeric
    # characters, whereas for a non-alphanumeric delimiter, only tokens of
    # length 1 can match.

###################################################################
    # Case 1 (rare): loop for case of alphanumeric quote delimiter..
    # "quote_pos" is the position the current word to begin searching
    ###################################################################
    if ( $beginning_tok =~ /\w/ ) {

# Note this because it is not recommended practice except
        # for obfuscated perl contests
        if ( $in_quote == 1 ) {
            write_logfile_entry(
                "Note: alphanumeric quote delimiter ($beginning_tok) \n");
        }

while ( $i < $max_token_index ) {

if ( $quote_pos == 0 || ( $i < 0 ) ) {
                $tok = $rtokens->[ ++$i ];

if ( $tok eq '\\' ) {

# retain backslash unless it hides the end token
                    $quoted_string .= $tok
                      unless $rtokens->[ $i + 1 ] eq $end_tok;
                    $quote_pos++;
                    last if ( $i >= $max_token_index );
                    $tok = $rtokens->[ ++$i ];
                }
            }
            my $old_pos = $quote_pos;

unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
            {

}
            $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );

if ( $quote_pos > 0 ) {

$quoted_string .=
                  substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );

$quote_depth--;

if ( $quote_depth == 0 ) {
                    $in_quote--;
                    last;
                }
            }
            else {
                if ( $old_pos <= length($tok) ) {
                    $quoted_string .= substr( $tok, $old_pos );
                }
            }
        }
    }

########################################################################
    # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
    ########################################################################
    else {

while ( $i < $max_token_index ) {
            $tok = $rtokens->[ ++$i ];

if ( $tok eq $end_tok ) {
                $quote_depth--;

if ( $quote_depth == 0 ) {
                    $in_quote--;
                    last;
                }
            }
            elsif ( $tok eq $beginning_tok ) {
                $quote_depth++;
            }
            elsif ( $tok eq '\\' ) {

# retain backslash unless it hides the beginning or end token
                $tok = $rtokens->[ ++$i ];
                $quoted_string .= '\\'
                  unless ( $tok eq $end_tok || $tok eq $beginning_tok );
            }
            $quoted_string .= $tok;
        }
    }
    if ( $i > $max_token_index ) { $i = $max_token_index }
    return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
        $quoted_string );
}

sub indicate_error {
    my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
    interrupt_logfile();
    warning($msg);
    write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
    resume_logfile();
    return;
}

sub write_error_indicator_pair {
    my ( $line_number, $input_line, $pos, $carrat ) = @_;
    my ( $offset, $numbered_line, $underline ) =
      make_numbered_line( $line_number, $input_line, $pos );
    $underline = write_on_underline( $underline, $pos - $offset, $carrat );
    warning( $numbered_line . "\n" );
    $underline =~ s/\s*$//;
    warning( $underline . "\n" );
    return;
}

sub make_numbered_line {

#  Given an input line, its line number, and a character position of
    #  interest, create a string not longer than 80 characters of the form
    #     $lineno: sub_string
    #  such that the sub_string of $str contains the position of interest
    #
    #  Here is an example of what we want, in this case we add trailing
    #  '...' because the line is long.
    #
    # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
    #
    #  Here is another example, this time in which we used leading '...'
    #  because of excessive length:
    #
    # 2: ... er of the World Wide Web Consortium's
    #
    #  input parameters are:
    #   $lineno = line number
    #   $str = the text of the line
    #   $pos = position of interest (the error) : 0 = first character
    #
    #   We return :
    #     - $offset = an offset which corrects the position in case we only
    #       display part of a line, such that $pos-$offset is the effective
    #       position from the start of the displayed line.
    #     - $numbered_line = the numbered line as above,
    #     - $underline = a blank 'underline' which is all spaces with the same
    #       number of characters as the numbered line.

my ( $lineno, $str, $pos ) = @_;
    my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
    my $excess = length($str) - $offset - 68;
    my $numc   = ( $excess > 0 ) ? 68 : undef;

if ( defined($numc) ) {
        if ( $offset == 0 ) {
            $str = substr( $str, $offset, $numc - 4 ) . " ...";
        }
        else {
            $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
        }
    }
    else {

if ( $offset == 0 ) {
        }
        else {
            $str = "... " . substr( $str, $offset + 4 );
        }
    }

my $numbered_line = sprintf( "%d: ", $lineno );
    $offset -= length($numbered_line);
    $numbered_line .= $str;
    my $underline = " " x length($numbered_line);
    return ( $offset, $numbered_line, $underline );
}

sub write_on_underline {

# The "underline" is a string that shows where an error is; it starts
    # out as a string of blanks with the same length as the numbered line of
    # code above it, and we have to add marking to show where an error is.
    # In the example below, we want to write the string '--^' just below
    # the line of bad code:
    #
    # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
    #                 ---^
    # We are given the current underline string, plus a position and a
    # string to write on it.
    #
    # In the above example, there will be 2 calls to do this:
    # First call:  $pos=19, pos_chr=^
    # Second call: $pos=16, pos_chr=---
    #
    # This is a trivial thing to do with substr, but there is some
    # checking to do.

my ( $underline, $pos, $pos_chr ) = @_;

# check for error..shouldn't happen
    unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
        return $underline;
    }
    my $excess = length($pos_chr) + $pos - length($underline);
    if ( $excess > 0 ) {
        $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
    }
    substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
    return ($underline);
}

sub pre_tokenize {

# Break a string, $str, into a sequence of preliminary tokens.  We
    # are interested in these types of tokens:
    #   words       (type='w'),            example: 'max_tokens_wanted'
    #   digits      (type = 'd'),          example: '0755'
    #   whitespace  (type = 'b'),          example: '   '
    #   any other single character (i.e. punct; type = the character itself).
    # We cannot do better than this yet because we might be in a quoted
    # string or pattern.  Caller sets $max_tokens_wanted to 0 to get all
    # tokens.
    my ( $str, $max_tokens_wanted ) = @_;

# we return references to these 3 arrays:
    my @tokens    = ();     # array of the tokens themselves
    my @token_map = (0);    # string position of start of each token
    my @type      = ();     # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct

do {

# whitespace
        if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }

# numbers
        # note that this must come before words!
        elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }

# words
        elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }

# single-character punctuation
        elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }

# that's all..
        else {
            return ( \@tokens, \@token_map, \@type );
        }

push @tokens,    $1;
        push @token_map, pos($str);

} while ( --$max_tokens_wanted != 0 );

return ( \@tokens, \@token_map, \@type );
}

sub show_tokens {

# this is an old debug routine
    # not called, but saved for reference
    my ( $rtokens, $rtoken_map ) = @_;
    my $num = scalar( @{$rtokens} );

foreach my $i ( 0 .. $num - 1 ) {
        my $len = length( $rtokens->[$i] );
        print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
    }
    return;
}

{    ## closure for sub matching end token
    my %matching_end_token;

BEGIN {
        %matching_end_token = (
            '{' => '}',
            '(' => ')',
            '[' => ']',
            '<' => '>',
        );
    }

sub matching_end_token {

# return closing character for a pattern
        my $beginning_token = shift;
        if ( $matching_end_token{$beginning_token} ) {
            return $matching_end_token{$beginning_token};
        }
        return ($beginning_token);
    }
}

sub dump_token_types {
    my ( $class, $fh ) = @_;

# This should be the latest list of token types in use
    # adding NEW_TOKENS: add a comment here
    $fh->print(<<'END_OF_LIST');

Here is a list of the token types currently used for lines of type 'CODE'.  
For the following tokens, the "type" of a token is just the token itself.

.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
( ) <= >= == =~ !~ != ++ -- /= x=
... **= <<= >>= &&= ||= //= <=> 
, + - / * | % ! x ~ = \ ? : . < > ^ &

The following additional token types are defined:

type    meaning
    b    blank (white space) 
    {    indent: opening structural curly brace or square bracket or paren
         (code block, anonymous hash reference, or anonymous array reference)
    }    outdent: right structural curly brace or square bracket or paren
    [    left non-structural square bracket (enclosing an array index)
    ]    right non-structural square bracket
    (    left non-structural paren (all but a list right of an =)
    )    right non-structural paren
    L    left non-structural curly brace (enclosing a key)
    R    right non-structural curly brace 
    ;    terminal semicolon
    f    indicates a semicolon in a "for" statement
    h    here_doc operator <<
    #    a comment
    Q    indicates a quote or pattern
    q    indicates a qw quote block
    k    a perl keyword
    C    user-defined constant or constant function (with void prototype = ())
    U    user-defined function taking parameters
    G    user-defined function taking block parameter (like grep/map/eval)
    M    (unused, but reserved for subroutine definition name)
    P    (unused, but -html uses it to label pod text)
    t    type indicater such as %,$,@,*,&,sub
    w    bare word (perhaps a subroutine call)
    i    identifier of some type (with leading %, $, @, *, &, sub, -> )
    n    a number
    v    a v-string
    F    a file test operator (like -e)
    Y    File handle
    Z    identifier in indirect object slot: may be file handle, object
    J    LABEL:  code block label
    j    LABEL after next, last, redo, goto
    p    unary +
    m    unary -
    pp   pre-increment operator ++
    mm   pre-decrement operator -- 
    A    : used as attribute separator
    
    Here are the '_line_type' codes used internally:
    SYSTEM         - system-specific code before hash-bang line
    CODE           - line of perl code (including comments)
    POD_START      - line starting pod, such as '=head'
    POD            - pod documentation text
    POD_END        - last line of pod section, '=cut'
    HERE           - text of here-document
    HERE_END       - last line of here-doc (target word)
    FORMAT         - format section
    FORMAT_END     - last line of format section, '.'
    DATA_START     - __DATA__ line
    DATA           - unidentified text following __DATA__
    END_START      - __END__ line
    END            - unidentified text following __END__
    ERROR          - we are in big trouble, probably not a perl script
END_OF_LIST

return;
}

BEGIN {

# These names are used in error messages
    @opening_brace_names = qw# '{' '[' '(' '?' #;
    @closing_brace_names = qw# '}' ']' ')' ':' #;

my @q;

my @digraphs = qw(
      .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
      <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
    );
    @is_digraph{@digraphs} = (1) x scalar(@digraphs);

my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
    @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);

my @tetragraphs = qw( <<>> );
    @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);

# make a hash of all valid token types for self-checking the tokenizer
    # (adding NEW_TOKENS : select a new character and add to this list)
    my @valid_token_types = qw#
      A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
      { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
      #;
    push( @valid_token_types, @digraphs );
    push( @valid_token_types, @trigraphs );
    push( @valid_token_types, @tetragraphs );
    push( @valid_token_types, ( '#', ',', 'CORE::' ) );
    @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);

# a list of file test letters, as in -e (Table 3-4 of 'camel 3')
    my @file_test_operators =
      qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
    @is_file_test_operator{@file_test_operators} =
      (1) x scalar(@file_test_operators);

# these functions have prototypes of the form (&), so when they are
    # followed by a block, that block MAY BE followed by an operator.
    # Smartmatch operator ~~ may be followed by anonymous hash or array ref
    @q = qw( do eval );
    @is_block_operator{@q} = (1) x scalar(@q);

# these functions allow an identifier in the indirect object slot
    @q = qw( print printf sort exec system say);
    @is_indirect_object_taker{@q} = (1) x scalar(@q);

# These tokens may precede a code block
    # patched for SWITCH/CASE/CATCH.  Actually these could be removed
    # now and we could let the extended-syntax coding handle them.
    # Added 'default' for Switch::Plain.
    @q =
      qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
      unless do while until eval for foreach map grep sort
      switch case given when default catch try finally);
    @is_code_block_token{@q} = (1) x scalar(@q);

# I'll build the list of keywords incrementally
    my @Keywords = ();

# keywords and tokens after which a value or pattern is expected,
    # but not an operator.  In other words, these should consume terms
    # to their right, or at least they are not expected to be followed
    # immediately by operators.
    my @value_requestor = qw(
      AUTOLOAD
      BEGIN
      CHECK
      DESTROY
      END
      EQ
      GE
      GT
      INIT
      LE
      LT
      NE
      UNITCHECK
      abs
      accept
      alarm
      and
      atan2
      bind
      binmode
      bless
      break
      caller
      chdir
      chmod
      chomp
      chop
      chown
      chr
      chroot
      close
      closedir
      cmp
      connect
      continue
      cos
      crypt
      dbmclose
      dbmopen
      defined
      delete
      die
      dump
      each
      else
      elsif
      eof
      eq
      evalbytes
      exec
      exists
      exit
      exp
      fc
      fcntl
      fileno
      flock
      for
      foreach
      formline
      ge
      getc
      getgrgid
      getgrnam
      gethostbyaddr
      gethostbyname
      getnetbyaddr
      getnetbyname
      getpeername
      getpgrp
      getpriority
      getprotobyname
      getprotobynumber
      getpwnam
      getpwuid
      getservbyname
      getservbyport
      getsockname
      getsockopt
      glob
      gmtime
      goto
      grep
      gt
      hex
      if
      index
      int
      ioctl
      join
      keys
      kill
      last
      lc
      lcfirst
      le
      length
      link
      listen
      local
      localtime
      lock
      log
      lstat
      lt
      map
      mkdir
      msgctl
      msgget
      msgrcv
      msgsnd
      my
      ne
      next
      no
      not
      oct
      open
      opendir
      or
      ord
      our
      pack
      pipe
      pop
      pos
      print
      printf
      prototype
      push
      quotemeta
      rand
      read
      readdir
      readlink
      readline
      readpipe
      recv
      redo
      ref
      rename
      require
      reset
      return
      reverse
      rewinddir
      rindex
      rmdir
      scalar
      seek
      seekdir
      select
      semctl
      semget
      semop
      send
      sethostent
      setnetent
      setpgrp
      setpriority
      setprotoent
      setservent
      setsockopt
      shift
      shmctl
      shmget
      shmread
      shmwrite
      shutdown
      sin
      sleep
      socket
      socketpair
      sort
      splice
      split
      sprintf
      sqrt
      srand
      stat
      state
      study
      substr
      symlink
      syscall
      sysopen
      sysread
      sysseek
      system
      syswrite
      tell
      telldir
      tie
      tied
      truncate
      uc
      ucfirst
      umask
      undef
      unless
      unlink
      unpack
      unshift
      untie
      until
      use
      utime
      values
      vec
      waitpid
      warn
      while
      write
      xor

switch
      case
      default
      given
      when
      err
      say
      isa

catch
    );

# patched above for SWITCH/CASE given/when err say
    # 'err' is a fairly safe addition.
    # Added 'default' for Switch::Plain. Note that we could also have
    # a separate set of keywords to include if we see 'use Switch::Plain'
    push( @Keywords, @value_requestor );

# These are treated the same but are not keywords:
    my @extra_vr = qw(
      constant
      vars
    );
    push( @value_requestor, @extra_vr );

@expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);

# this list contains keywords which do not look for arguments,
    # so that they might be followed by an operator, or at least
    # not a term.
    my @operator_requestor = qw(
      endgrent
      endhostent
      endnetent
      endprotoent
      endpwent
      endservent
      fork
      getgrent
      gethostent
      getlogin
      getnetent
      getppid
      getprotoent
      getpwent
      getservent
      setgrent
      setpwent
      time
      times
      wait
      wantarray
    );

push( @Keywords, @operator_requestor );

# These are treated the same but are not considered keywords:
    my @extra_or = qw(
      STDERR
      STDIN
      STDOUT
    );

push( @operator_requestor, @extra_or );

@expecting_operator_token{@operator_requestor} =
      (1) x scalar(@operator_requestor);

# these token TYPES expect trailing operator but not a term
    # note: ++ and -- are post-increment and decrement, 'C' = constant
    my @operator_requestor_types = qw( ++ -- C <> q );
    @expecting_operator_types{@operator_requestor_types} =
      (1) x scalar(@operator_requestor_types);

# these token TYPES consume values (terms)
    # note: pp and mm are pre-increment and decrement
    # f=semicolon in for,  F=file test operator
    my @value_requestor_type = qw#
      L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
      **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
      <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
      f F pp mm Y p m U J G j >> << ^ t
      ~. ^. |. &. ^.= |.= &.=
      #;
    push( @value_requestor_type, ',' )
      ;    # (perl doesn't like a ',' in a qw block)
    @expecting_term_types{@value_requestor_type} =
      (1) x scalar(@value_requestor_type);

# Note: the following valid token types are not assigned here to
    # hashes requesting to be followed by values or terms, but are
    # instead currently hard-coded into sub operator_expected:
    # ) -> :: Q R Z ] b h i k n v w } #

# For simple syntax checking, it is nice to have a list of operators which
    # will really be unhappy if not followed by a term.  This includes most
    # of the above...
    %really_want_term = %expecting_term_types;

# with these exceptions...
    delete $really_want_term{'U'}; # user sub, depends on prototype
    delete $really_want_term{'F'}; # file test works on $_ if no following term
    delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
                                   # let perl do it

@q = qw(q qq qw qx qr s y tr m);
    @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);

@q = qw(package);
    @is_package{@q} = (1) x scalar(@q);

@q = qw( ? : );
    push @q, ',';
    @is_comma_question_colon{@q} = (1) x scalar(@q);

# Hash of other possible line endings which may occur.
    # Keep these coordinated with the regex where this is used.
    # Note: chr(13) = chr(015)="\r".
    @q = ( chr(13), chr(29), chr(26) );
    @other_line_endings{@q} = (1) x scalar(@q);

# These keywords are handled specially in the tokenizer code:
    my @special_keywords = qw(
      do
      eval
      format
      m
      package
      q
      qq
      qr
      qw
      qx
      s
      sub
      tr
      y
    );
    push( @Keywords, @special_keywords );

# Keywords after which list formatting may be used
    # WARNING: do not include |map|grep|eval or perl may die on
    # syntax errors (map1.t).
    my @keyword_taking_list = qw(
      and
      chmod
      chomp
      chop
      chown
      dbmopen
      die
      elsif
      exec
      fcntl
      for
      foreach
      formline
      getsockopt
      if
      index
      ioctl
      join
      kill
      local
      msgctl
      msgrcv
      msgsnd
      my
      open
      or
      our
      pack
      print
      printf
      push
      read
      readpipe
      recv
      return
      reverse
      rindex
      seek
      select
      semctl
      semget
      send
      setpriority
      setsockopt
      shmctl
      shmget
      shmread
      shmwrite
      socket
      socketpair
      sort
      splice
      split
      sprintf
      state
      substr
      syscall
      sysopen
      sysread
      sysseek
      system
      syswrite
      tie
      unless
      unlink
      unpack
      unshift
      until
      vec
      warn
      while
      given
      when
    );
    @is_keyword_taking_list{@keyword_taking_list} =
      (1) x scalar(@keyword_taking_list);

# perl functions which may be unary operators.

# This list is used to decide if a pattern delimited by slashes, /pattern/,
    # can follow one of these keywords.
    @q = qw(
      chomp eof eval fc lc pop shift uc undef
    );

@is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
      (1) x scalar(@q);

# These are keywords for which an arg may optionally be omitted.  They are
    # currently only used to disambiguate a ? used as a ternary from one used
    # as a (depricated) pattern delimiter.  In the future, they might be used
    # to give a warning about ambiguous syntax before a /.
    # Note: split has been omitted (see not below).
    my @keywords_taking_optional_arg = qw(
      abs
      alarm
      caller
      chdir
      chomp
      chop
      chr
      chroot
      close
      cos
      defined
      die
      eof
      eval
      evalbytes
      exit
      exp
      fc
      getc
      glob
      gmtime
      hex
      int
      last
      lc
      lcfirst
      length
      localtime
      log
      lstat
      mkdir
      next
      oct
      ord
      pop
      pos
      print
      printf
      prototype
      quotemeta
      rand
      readline
      readlink
      readpipe
      redo
      ref
      require
      reset
      reverse
      rmdir
      say
      select
      shift
      sin
      sleep
      sqrt
      srand
      stat
      study
      tell
      uc
      ucfirst
      umask
      undef
      unlink
      warn
      write
    );
    @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
      (1) x scalar(@keywords_taking_optional_arg);

# This list is used to decide if a pattern delmited by question marks,
    # ?pattern?, can follow one of these keywords.  Note that from perl 5.22
    # on, a ?pattern? is not recognized, so we can be much more strict than
    # with a /pattern/. Note that 'split' is not in this list. In current
    # versions of perl a question following split must be a ternary, but
    # in older versions it could be a pattern.  The guessing algorithm will
    # decide.  We are combining two lists here to simplify the test.
    @q = ( @keywords_taking_optional_arg, @operator_requestor );
    @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
      (1) x scalar(@q);

# These are not used in any way yet
    #    my @unused_keywords = qw(
    #     __FILE__
    #     __LINE__
    #     __PACKAGE__
    #     );

#  The list of keywords was originally extracted from function 'keyword' in
    #  perl file toke.c version 5.005.03, using this utility, plus a
    #  little editing: (file getkwd.pl):
    #  while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
    #  Add 'get' prefix where necessary, then split into the above lists.
    #  This list should be updated as necessary.
    #  The list should not contain these special variables:
    #  ARGV DATA ENV SIG STDERR STDIN STDOUT
    #  __DATA__ __END__

@is_keyword{@Keywords} = (1) x scalar(@Keywords);
}
1;