Gelex: Scanner Skeleton

Scanner Skeleton
note

    description: "General lexical analyzers"
    library: "Gobo Eiffel Lexical Library"
    copyright: "Copyright (c) 2001-2019, Eric Bezault and others"
    license: "MIT License"

deferred class YY_SCANNER

create

    make
            -- Create a new scanner with
            -- standard input as input file.

    make_with_file (a_file: KI_CHARACTER_INPUT_STREAM)
            -- Create a new scanner with
            -- a_file as input file.
            -- To be used when a_file contains ISO-8859-1 characters,
            -- or when it is using the UTF-8 encoding and the scanner is
            -- either using the "%option utf8" or has been manually written
            -- to expect sequences of UTF-8 bytes.
        require
            a_file_not_void: a_file /= Void
            a_file_open_read: a_file.is_open_read

    make_with_unicode_file (a_file: KI_CHARACTER_INPUT_STREAM)
            -- Create a new scanner with a_file as input file.
            -- a_file is expected to be encoded in UTF-8
            -- or ISO-8859-1, and the input buffer will handle
            -- the corresponding Unicode characters.
        require
            a_file_not_void: a_file /= Void
            a_file_open_read: a_file.is_open_read

    make_with_buffer (a_buffer: like input_buffer)
            -- Create a new scanner with
            -- a_buffer as input buffer.
        require
            a_buffer_not_void: a_buffer /= Void
        ensure
            input_buffer_set: input_buffer = a_buffer

feature -- Initialization

    reset
            -- Reset scanner before scanning next input source.
            -- (This routine can be called in wrap before scanning
            -- another input buffer.)

    reset_with_file (a_file: KI_CHARACTER_INPUT_STREAM)
            -- Reset scanner before scanning next input source.
            -- Then, if input_buffer was a file buffer, reuse it
            -- and set it to a_file; otherwise create a new file
            -- input buffer with a_file.
        require
            a_file_not_void: a_file /= Void
            a_file_open_read: a_file.is_open_read

    reset_with_string (a_string: STRING_8)
            -- Reset scanner before scanning next input source.
            -- Then, if input_buffer was not Empty_buffer, reuse it
            -- and set it to a_string; otherwise create a new input
            -- buffer with a_string.
        require
            a_string_not_void: a_string /= Void

    reset_start_condition
            -- Clear pushed start conditions and set start_condition
            -- to the "INITIAL" start condition.
        ensure
            pushed_start_conditions_cleared: pushed_start_condition_count = 0

feature -- Access

    last_token: INTEGER
            -- Code of last token read
            -- (0 means that the end-of-input has been reached,
            -- non-positive values mean that an error occurred
            -- (see header-comment of scanning_error.))

    text: STRING_8
            -- Text of last token read
            -- (Create a new string at each call.)
            --
            -- Note that if input_buffer contains Unicode characters
            -- which cannot be represented as 8-bit characters, they
            -- will be replaced by a replacement character specified
            -- in the buffer.
        ensure
            text_not_void: Result /= Void
            correct_count: Result.count = text_count

    unicode_text: STRING_32
            -- Unicode text of last token read
            -- (Create a new string at each call.)
            --
            -- Note that if the scanner is written to receive sequences
            -- of UTF-8 bytes, unicode_text will treat each single
            -- byte as a character. It will not try to decode the UTF-8 bytes
            -- into Unicode characters.
            --
            -- Note that unicode_text does not contain surrogate
            -- or invalid Unicode characters.
        ensure
            unicode_text_not_void: Result /= Void
            correct_count: Result.count = text_count

    utf8_text: STRING_8
            -- UTF-8 representation of last token read
            -- (Create a new string at each call.)
            --
            -- Note that unicode_text does not contain surrogate
            -- or invalid Unicode characters, therefore the resulting
            -- string is valid UTF-8.
        ensure
            utf8_text_not_void: Result /= Void
            utf8_text_is_string_8: Result.same_type ({STRING_8} "")
            valid_utf8: {UC_UTF8_ROUTINES}.valid_utf8 (Result)
            correct_count: Result.count = {UC_UTF8_ROUTINES}.string_byte_count (unicode_text)
            definition: Result.is_equal ({UC_UTF8_ROUTINES}.string_to_utf8 (unicode_text))

    text_item (i: INTEGER): CHARACTER_8
            -- i-th character of last token read
        require
            i_large_enough: i >= 1
            i_small_enough: i <= text_count
        ensure
            definition: Result = text.item (i)

    unicode_text_item (i: INTEGER): CHARACTER_32
            -- i-th Unicode character of last token read
            --
            -- Note that unicode_text does not contain surrogate
            -- or invalid Unicode characters.
        require
            i_large_enough: i >= 1
            i_small_enough: i <= text_count
        ensure
            definition: Result = unicode_text.item (i)

    text_substring (s, e: INTEGER): STRING_8
            -- Substring of last token read, from index s to index e
            -- (Create a new string at each call.)
            -- (For efficiency reasons, this function can bypass the
            -- call to text and create the substring directly from
            -- the input buffer.)
            --
            -- The result is empty when s = e + 1.
        require
            meaningful_start: 1 <= s
            meaningful_interval: s <= e + 1
            meaningful_end: e <= text_count
        ensure
            text_substring_not_void: Result /= Void
            text_substring_empty: (s > e) implies Result.is_empty
            definition: Result.is_equal (text.substring (s, e))

    unicode_text_substring (s, e: INTEGER): STRING_32
            -- Unicode substring of last token read
            -- (Create a new string at each call.)
            -- (For efficiency reasons, this function can bypass the
            -- call to unicode_text and create the substring directly from
            -- the input buffer.)
            --
            -- Note that unicode_text does not contain surrogate
            -- or invalid Unicode characters.
        require
            meaningful_start: 1 <= s
            meaningful_interval: s <= e + 1
            meaningful_end: e <= text_count
        ensure
            unicode_text_substring_not_void: Result /= Void
            unicode_text_substring_empty: (s > e) implies Result.is_empty
            definition: Result.is_equal (unicode_text.substring (s, e))

    utf8_text_substring (s, e: INTEGER): STRING_8
            -- UTF-8 representation of substring of last token read
            -- (Create a new string at each call.)
            -- (For efficiency reasons, this function can bypass the
            -- call to unicode_text and create the substring directly from
            -- the input buffer.)
            --
            -- Note that s and e are character indexes in unicode_text,
            -- not byte indexes in the resulting UTF-8 string.
            --
            -- Note that unicode_text does not contain surrogate
            -- or invalid Unicode characters, therefore the resulting
            -- string is valid UTF-8.
        require
            meaningful_start: 1 <= s
            meaningful_interval: s <= e + 1
            meaningful_end: e <= text_count
        ensure
            utf8_text_not_void: Result /= Void
            utf8_text_is_string_8: Result.same_type ({STRING_8} "")
            valid_utf8: {UC_UTF8_ROUTINES}.valid_utf8 (Result)
            utf8_text_substring_empty: (s > e) implies Result.is_empty
            definition: Result.is_equal ({UC_UTF8_ROUTINES}.string_to_utf8 (unicode_text.substring (s, e)))
            correct_count: Result.count = {UC_UTF8_ROUTINES}.string_byte_count (unicode_text.substring (s, e))

    start_condition: INTEGER
            -- Current start condition
            -- (Changed by set_start_condition, push_start_condition
            -- and pop_start_condition; set back to the "INITIAL"
            -- start condition by reset_start_condition.)

feature -- Measurement

    text_count: INTEGER
            -- Length of last token read
        ensure
            text_count_not_negative: Result >= 0

    line: INTEGER
            -- Line number of last token read when
            -- '%option line' has been specified
        ensure
            line_positive: Result >= 1

    column: INTEGER
            -- Column number of last token read when
            -- '%option line' has been specified
        ensure
            column_positive: Result >= 1

    position: INTEGER
            -- Position of last token read (i.e. number of
            -- characters from the start of the input source)
            -- when '%option position' has been specified
        ensure
            position_positive: Result >= 1

    pushed_start_condition_count: INTEGER
            -- Number of start conditions already pushed (and not popped yet)
        ensure
            pushed_start_condition_count_not_negative: Result >= 0

feature -- Status report

    end_of_file: BOOLEAN
            -- Has the end of input buffer been reached?
            -- This means that last_token has been set
            -- to 0 indicating "all done".

    scanning_error: BOOLEAN
            -- Has an error occurred during scanning?
            -- This can occur when reject is called too many times (and
            -- hence nothing can be matched anymore) or when the option
            -- "nodefault" (or option -s) has been specified but the
            -- default rule is matched nevertheless.

    valid_start_condition (sc: INTEGER): BOOLEAN
            -- Is sc a valid start condition?

feature -- Setting

    set_last_token (a_token: INTEGER)
            -- Set last_token to a_token.
        ensure
            last_token_set: last_token = a_token

    set_start_condition (a_start_condition: INTEGER)
            -- Set start_condition to a_start_condition.
        require
            valid_start_condition: valid_start_condition (a_start_condition)
        ensure
            start_condition_set: start_condition = a_start_condition

    push_start_condition (a_start_condition: INTEGER)
            -- Set start condition and add previous to stack.
        require
            valid_start_condition: valid_start_condition (a_start_condition)
        ensure
            start_condition_set: start_condition = a_start_condition
            one_more: pushed_start_condition_count = old pushed_start_condition_count + 1

    pop_start_condition
            -- Restore previous start condition.
        require
            has_pushed_start_conditions: pushed_start_condition_count > 0
        ensure
            one_less: pushed_start_condition_count = old pushed_start_condition_count - 1

feature -- Scanning

    scan
            -- Scan input_buffer until end of file is found
            -- or an error occurs.
        ensure
            end_of_file: not scanning_error implies end_of_file

    read_token
            -- Read a token from input_buffer.
            -- Make result available in last_token.

feature -- Element change

    append_text_to_string (a_string: STRING_8)
            -- Append text at end of a_string.
            -- (For efficiency reasons, this feature can bypass the
            -- call to text and directly copy the characters from
            -- the input buffer.)
        require
            a_string_not_void: a_string /= Void
        ensure
            count_set: a_string.count = old (a_string.count) + text_count
            definition: a_string.substring (old (a_string.count) + 1, a_string.count).same_string (text)

    append_unicode_text_to_string (a_string: STRING_32)
            -- Append unicode_text at end of a_string.
            -- (For efficiency reasons, this feature can bypass the
            -- call to unicode_text and directly copy the characters from
            -- the input buffer.)
        require
            a_string_not_void: a_string /= Void
        ensure
            count_set: a_string.count = old (a_string.count) + text_count
            definition: a_string.substring (old (a_string.count) + 1, a_string.count).same_string (unicode_text)

    append_utf8_text_to_string (a_string: STRING_8)
            -- Append utf8_text at end of a_string.
            -- (For efficiency reasons, this feature can bypass the
            -- call to utf8_text and directly copy the characters from
            -- the input buffer.)
            --
            -- Note that a_string grows by the number of UTF-8 bytes
            -- in utf8_text, which may differ from text_count.
        require
            a_string_not_void: a_string /= Void
            a_string_is_string_8: a_string.same_type ({STRING_8} "")
        ensure
            count_set: a_string.count = old (a_string.count) + utf8_text.count
            definition: a_string.substring (old (a_string.count) + 1, a_string.count).same_string (utf8_text)

    append_text_substring_to_string (s, e: INTEGER; a_string: STRING_8)
            -- Append text_substring (s, e) at end of a_string.
            -- (For efficiency reasons, this feature can bypass
            -- the call to text_substring and directly copy
            -- the characters from the input buffer.)
        require
            a_string_not_void: a_string /= Void
            s_large_enough: 1 <= s
            valid_interval: s <= e + 1
            e_small_enough: e <= text_count
        ensure
            count_set: a_string.count = old (a_string.count) + (e - s + 1)
            definition: a_string.substring (old (a_string.count) + 1, a_string.count).same_string (text_substring (s, e))

    append_unicode_text_substring_to_string (s, e: INTEGER; a_string: STRING_32)
            -- Append unicode_text_substring (s, e) at end of a_string.
            -- (For efficiency reasons, this feature can bypass
            -- the call to unicode_text_substring and directly copy
            -- the characters from the input buffer.)
        require
            a_string_not_void: a_string /= Void
            s_large_enough: 1 <= s
            valid_interval: s <= e + 1
            e_small_enough: e <= text_count
        ensure
            count_set: a_string.count = old (a_string.count) + (e - s + 1)
            definition: a_string.substring (old (a_string.count) + 1, a_string.count).same_string (unicode_text_substring (s, e))

    append_utf8_text_substring_to_string (s, e: INTEGER; a_string: STRING_8)
            -- Append utf8_text_substring (s, e) at end of a_string.
            -- (For efficiency reasons, this feature can bypass the
            -- call to utf8_text_substring and directly copy the characters from
            -- the input buffer.)
            --
            -- Note that a_string grows by the number of UTF-8 bytes
            -- in utf8_text_substring (s, e), which may differ
            -- from e - s + 1.
        require
            a_string_not_void: a_string /= Void
            a_string_is_string_8: a_string.same_type ({STRING_8} "")
            s_large_enough: 1 <= s
            valid_interval: s <= e + 1
            e_small_enough: e <= text_count
        ensure
            count_set: a_string.count = old (a_string.count) + utf8_text_substring (s, e).count
            definition: a_string.substring (old (a_string.count) + 1, a_string.count).same_string (utf8_text_substring (s, e))

    terminate
            -- Terminate scanner and set last_token
            -- to 0 indicating "all done".

    wrap: BOOLEAN
            -- Should current scanner terminate when end of file is reached?
            -- This function can be redefined to switch to another input
            -- buffer (but don't forget to update start_condition).
            -- (Default: True.)

    more
            -- Tell scanner to append the next matched token
            -- to current value of text instead of
            -- replacing it.

    less (n: INTEGER)
            -- Return all but the first n matched
            -- characters back to input_buffer.
            -- (After the call, text_count is n.)
        require
            n_large_enough: n >= 0
            n_small_enough: n <= text_count
        ensure
            text_count_set: text_count = n

    unread_character (c: CHARACTER_8)
            -- Put c back to input_buffer. This will alter both
            -- text and the content of input_buffer.

    unread_unicode_character (c: CHARACTER_32)
            -- Put c back to input_buffer. This will alter both
            -- unicode_text and the content of input_buffer.
            -- The behavior is undefined if c is too large to fit into input_buffer.

    read_character
            -- Read a character from input_buffer.
            -- Make result available in last_character and last_unicode_character.
            --
            -- Note that if input_buffer contains Unicode characters
            -- which cannot be represented as 8-bit characters, they
            -- will be replaced by a replacement character specified
            -- in the buffer.

    last_character: CHARACTER_8
            -- Last character read by read_character

    last_unicode_character: CHARACTER_32
            -- Last Unicode character read by read_character

feature -- Input

    input_buffer: YY_BUFFER
            -- Input buffer

    set_input_buffer (a_buffer: like input_buffer)
            -- Set input_buffer to a_buffer.
        require
            a_buffer_not_void: a_buffer /= Void
        ensure
            input_buffer_set: input_buffer = a_buffer

    flush_input_buffer
            -- Flush input_buffer. input_buffer will be automatically
            -- refilled unless end of file has been found.
        ensure
            flushed: input_buffer.count = 0

    new_file_buffer (a_file: KI_CHARACTER_INPUT_STREAM): YY_FILE_BUFFER
            -- New input buffer for a_file.
            -- To be used when a_file contains ISO-8859-1 characters,
            -- or when it is using the UTF-8 encoding and the scanner is
            -- either using the "%option utf8" or has been manually written
            -- to expect sequences of UTF-8 bytes.
        require
            a_file_not_void: a_file /= Void
            a_file_open_read: a_file.is_open_read
        ensure
            new_buffer_not_void: Result /= Void

    new_unicode_file_buffer (a_file: KI_CHARACTER_INPUT_STREAM): YY_UNICODE_FILE_BUFFER
            -- New Unicode input buffer for a_file.
            -- a_file is expected to be encoded in UTF-8
            -- or ISO-8859-1.
            -- The scanner will receive Unicode characters,
            -- not sequences of UTF-8 bytes.
        require
            a_file_not_void: a_file /= Void
            a_file_open_read: a_file.is_open_read
        ensure
            new_unicode_file_buffer_not_void: Result /= Void

    new_utf8_file_buffer (a_file: KI_CHARACTER_INPUT_STREAM): YY_UTF8_FILE_BUFFER
            -- New UTF-8 input buffer for a_file.
            -- To be used when a_file contains ISO-8859-1 characters or when it
            -- is using the UTF-8 encoding, and the scanner is either using the
            -- "%option utf8" or has been manually written to expect sequences
            -- of UTF-8 bytes.
            -- The scanner will receive sequences of UTF-8 bytes.
        require
            a_file_not_void: a_file /= Void
            a_file_open_read: a_file.is_open_read
        ensure
            new_utf8_file_buffer_not_void: Result /= Void

    new_string_buffer (a_string: STRING_8): YY_BUFFER
            -- New input buffer for a_string.
            -- To be used when a_string contains ISO-8859-1 characters,
            -- or when it is using the UTF-8 encoding and the scanner is
            -- either using the "%option utf8" or has been manually written
            -- to expect sequences of UTF-8 bytes.
            --
            -- a_string must be a direct instance of STRING_8,
            -- not an instance of one of its descendants.
        require
            a_string_not_void: a_string /= Void
            a_string_is_string: a_string.same_type ({STRING_8} "")
        ensure
            new_buffer_not_void: Result /= Void

    new_unicode_string_buffer (a_string: READABLE_STRING_GENERAL): YY_UNICODE_BUFFER
            -- New Unicode input buffer for a_string.
            -- To be used when a_string contains ISO-8859-1 or Unicode characters.
            -- The scanner will receive Unicode characters, not sequences of UTF-8 bytes.
        require
            a_string_not_void: a_string /= Void
        ensure
            new_unicode_string_buffer_not_void: Result /= Void

    new_utf8_string_buffer (a_string: READABLE_STRING_GENERAL): YY_UTF8_BUFFER
            -- New UTF-8 input buffer for a_string.
            -- To be used when a_string contains ISO-8859-1 or Unicode characters,
            -- and the scanner is either using the "%option utf8" or has been
            -- manually written to expect sequences of UTF-8 bytes.
            -- a_string is expected to contain valid non-surrogate Unicode
            -- characters. Invalid or surrogate Unicode characters are encoded
            -- with one byte 0xFF (which is an invalid byte in UTF-8).
            -- The scanner will receive sequences of UTF-8 bytes.
        require
            a_string_not_void: a_string /= Void
        ensure
            new_utf8_string_buffer_not_void: Result /= Void

    Empty_buffer: YY_BUFFER
            -- Empty input buffer
        ensure
            empty_buffer_not_void: Result /= Void

feature -- Output

    output (a_text: like text)
            -- Output a_text.
            -- (Note: this routine can be redefined in descendant
            -- classes. Default: print a_text to standard output.)
        require
            a_text_not_void: a_text /= Void

    echo
            -- Output text using feature output.

feature -- Action

    pre_action
            -- Action executed before every semantic action
            -- when '%option pre-action' has been specified.
            -- (Note: this routine can be redefined in descendant
            -- classes. Default: do nothing.)

    post_action
            -- Action executed after every semantic action
            -- when '%option post-action' has been specified.
            -- (Note: this routine can be redefined in descendant
            -- classes. Default: do nothing.)

    pre_eof_action
            -- Action executed before every end-of-file semantic action
            -- (i.e. <<EOF>>) when '%option pre-eof-action' has been specified.
            -- (Note: this routine can be redefined in descendant classes.
            -- Default: do nothing.)

    post_eof_action
            -- Action executed after every end-of-file semantic action
            -- (i.e. <<EOF>>) when '%option post-eof-action' has been specified.
            -- (Note: this routine can be redefined in descendant classes.
            -- Default: do nothing.)

    default_action
            -- Action executed when default rule is matched.
            -- (Note: this routine can be redefined in descendant classes.
            -- Default: print last character read to standard output.)

feature -- Error handling

    fatal_error (a_message: STRING_8)
            -- A fatal error occurred.
            -- Print a_message.
        require
            a_message_not_void: a_message /= Void

    report_invalid_unicode_character_error (a_code: NATURAL_32)
            -- Report that the surrogate or invalid Unicode character
            -- with code a_code has been read from the input
            -- buffer and caused the scanner to fail.

feature -- Debugging

    print_last_token
            -- Print to standard error debug information
            -- about the last token read. Can be redefined
            -- in descendant classes to print more information.
            -- (Called at the end of read_token when compiled
            -- with 'debug ("GELEX")' enabled).

invariant

    input_buffer_not_void: input_buffer /= Void
    valid_start_condition: valid_start_condition (start_condition)

end
Copyright © 2001-2019, Eric Bezault and others
mailto:ericb@gobosoft.com
http://www.gobosoft.com
Last Updated: 28 September 2019