#!/usr/bin/perl -w
#
# LEXSYM - Lexical analysis and symbol table generation script
#
# @ Author: Loren Segal
# @ Date: February 2005
#
# Usage: ./lexsym [source_file.t]
# (If a file is not given in command line, it will be prompted via the script)
#
# Description:
# This script will analyze a T source file and generate a symbol table for
# all of the identifiers which were matched. It also does basic code parsing.
# It should be stressed however, how limited the code parsing actually is, and
# because of this, not all generated error messages are nessarily intuitive to
# the user. The main goal of this script is to generate a proper symbol table,
# NOT to properly parse code or give descriptive error messages. With that said,
# expect to be able to 'break' the error generation system fairly easily (eg.
# unclosed strings, unclosed block comments, invalid array assignments, etc.)
#
# Note for SOEN229: The requirements state that identifier,arithmetic,bracket
# number,etc. must be recognized by the parser. All of these
# requirements have been met. In addition, however, strings
# and line/block comments are also recognized with a C-style
# syntax. I did this for personal knowledge, and while you
# are free to test them, please do not take off marks for
# the simple fact that they are defined, thanks :-).
#
use strict; # be strict about programming style
####
### BEGIN PROTOTYPES
####
# symbol table functions
sub init_symtable($);
sub load_symtable();
sub add_symbol($@);
sub lookup_symbol($);
sub print_symtable();
# code parsing functions
sub code_parse($);
sub syntax_error($);
# callback parse functions
sub parse_identifier($);
sub parse_brace($);
sub parse_newline($);
sub parse_bcomment($);
# main/misc prototypes
sub main();
####
### END PROTOTYPES
####
# GLOBAL VARIABLES
my(%SYMTAB); # Global Symbol Table
# PARSE TABLE - Contains all lexical analysis definitions and callback subs
#
# The code_parse() function will use this table to test all of the regex's to
# match any token in the grammar. If none of the following definitions are
# matched, an 'invalid character' error will be thrown. Upon a match, a
# callback subroutine will be called if there is one defined. There is also a
# special key 'silent' which will hide default output of the recognized token,
# this is specifically used for newline matches.
#
# Keys:
# 'regex' => contains regex information for the grammar definition
# 'callback' => a possible subroutine to handle the definition in more depth-
# this is usually for error checking, rather than simple symbol
# table work.
# 'silent' => a special key to hide the default output by the token
# recognition loop.
#
my(%PARSETABLE) = (
'Identifier' => {
'regex' => qr/[a-z_]\w*/,
'callback' => \&parse_identifier
},
'Unsigned Value' => {
'regex' => qr/\d+(?:\.\d+)?/,
'callback' => undef
},
'String Value' => {
'regex' => qr/(['"]).*?[^\\]\1|(['"])\2/, #'"
'callback' => undef
},
'Increment' => {
'regex' => qr/[+-]{2}(?![+-])/,
'callback' => undef
},
'Arithmetic' => {
'regex' => qr/[+-](?![+-])/,
'callback' => undef
},
'Relational' => {
'regex' => qr/[<>]=?/,
'callback' => undef
},
'Assignment' => {
'regex' => qr/=/,
'callback' => undef
},
'Equality' => {
'regex' => qr/(?:!=|==)/,
'callback' => undef
},
'Brackets' => {
'regex' => qr/[()]/,
'callback' => undef
},
'Braces' => {
'regex' => qr/[{}]/,
'callback' => \&parse_brace
},
'End_Statement' => {
'regex' => qr/;/,
'callback' => undef
},
'Next Argument' => {
'regex' => qr/,/,
'callback' => undef
},
'Next Line' => { # Adding a nextline token match instead
'regex' => qr/[\n\r]+/, # of going line by line allows for multi
'callback' => \&parse_newline, # line comments and also takes out one
'silent' => 1 # loop-nest of the code_parse loop.
},
'Line Comment' => { # Thankfully, '.' does not match newlines
'regex' => qr|//.*|, # unless the //s switch is applied.
'callback' => undef
},
'Block Comment' => {
'regex' => qr|/\*.*\*/|s, # Here we take advantages of m//s for
'callback' => \&parse_bcomment # comments that can span multiple lines.
}
);
# STATE table - Holds all information during a code_parse
# note: originally, %STATE was a local variable to code_parse.
# However, as its size grew, it seemed unorganized to define
# such a large (and important) hash locally within a sub.
#
# Keys:
# 'word' => The current identifier token, all other tokens are ignored
# 'last_word' => The last identifier
# 'scope' => The scope of the program. Every opening brace increases this
# count by one, and vice versa for a close brace. Any scope > 0
# is considered local, and 0 is considered global.
# 'line_number' => The current line number of the lexical parser
#
my(%STATE) = (
'word' => undef,
'last_word' => undef,
'scope' => 0,
'line_number' => 1
);
####
### BEGIN PROGRAM LISTING
####
main(); # Call main()
####
### END PROGRAM
####
sub main()
{
my($file);
init_symtable("symbolfile.txt"); # initiate symbol table, hardcoded location
load_symtable(); # load keyval pairs into table
print_symtable(); # print symbol table before parsing
# Get the filename
if ($ARGV[0])
{
$file = $ARGV[0]; # take the source file from command line
} # if it was specified.
else
{
print "Enter source file: "; # if no source file is entered via command
chomp($file = <STDIN>); # line, it will prompt the user for one.
}
code_parse($file); # parse the file
print_symtable(); # print the generated symbol table
}
####
### SYMBOL TABLE FUNCTIONS
####
## Function: init_symbtable(FILENAME)
#
## Description: Opens the symbol table for reading under the filehandle SYMTAB
#
sub init_symtable($)
{
open(SYMTAB, $_[0]) or die("% Error: could not open file '$_[0]': $!\n");
}
## Function: load_symtable()
#
## Description: Adds all of the key/value pairs to their
# appropriate symbol name in the SYMTAB.
#
sub load_symtable()
{
local $_;
my($line, $valid) = (0, 1);
for (<SYMTAB>)
{
chomp;
next if (!$_ || m/^\s*#.*/); # ignore commented and empty lines
# -- NOT required by assignment.
$line++;
my($name) = m/^\s*(\w+):\s*\w+=\w+/; # matches-> NAME: *=* ...
my(@matches) = m/(\w*)=(\w*)/g; # matches-> *=*
$valid = 1;
for(my($i) = 0; $i <= $#matches; $i++)
{
if ($matches[$i] eq '') { $valid = 0; last; }
}
if (!$name || !$valid)
{
# Give an error if a line of the symbol table file is corrupted or
# improperly defined.
die("% Error: symbol table has invalid definition at line $line.\n");
}
# Add the symbol- it is valid.
add_symbol($name, @matches);
}
close(SYMTAB);
}
## Function: add_symbol(NAME, HASH_VALUES)
#
## Description:
# Adds a symbol NAME to the SYMTABLE along with its key/value pairs defined by
# HASH_VALUES
#
sub add_symbol($@)
{
my($name, %symbols) = @_;
return if (!values(%symbols));
$symbols{'count'} = 0; # Force count to be initialized at 0,
# even if it is not declared or set as something
# else in the symbol file, which is not legal.
$SYMTAB{$name} = \%symbols;
}
## Function: lookup_symbol(NAME)
#
## Description:
# Looks up a symbol in SYMTAB and returns true or false (undef) if it exists
# or not, respectively.
#
sub lookup_symbol($)
{
my($name) = @_;
return (defined($SYMTAB{$name}) ? !undef : undef);
}
## Function: print_symtable()
#
## Description:
# Prints the SYMTAB in a human readable format to the screen in a table.
# This function will make two passes, scanning the SYMBOL TABLE once to
# find all of the keys that need to be printed, and then to actually print
# all of the keys with the values for each symbol.
#
sub print_symtable()
{
my $range = 16; # Put a max of 16 spaces between each key
my $subrange = $range - 1; # Needed to trimming
my($width, $key, $name, $value, $msg, %list);
# Do first pass, find all the keys that were set by symbol definitions.
# This is to allow other definitions besides subtype,type,count,scope
while (($name, $value) = (each %SYMTAB))
{
for $key (keys %$value) { $list{$key} = $key; }
}
$width = (keys(%list) + 1) * $range; # Get width of the table
# Print the header with all the keys generated by the first pass
print ''.('-' x $width) . "\n";
print "Symbol Table\nname". (' ' x ($range - length("name")));
for $key (sort keys %list)
{
print $key . (' ' x ($range - length($key)));
}
print "\n".('-' x $width)."\n\n";
# Done header
# Print the body, if a key is not defined for a certain symbol, print ---
for $name (sort keys %SYMTAB)
{
print $name . (' ' x ($range - length($name)));
for $key (sort keys %list)
{
$value = $SYMTAB{$name}{$key};
if (!defined($SYMTAB{$name}{$key})) { $value = '---'; }
$value =~ s/(.{1,$subrange}).*/$1/; # Chop off the end if it is too long
# for the column.
print $value . (' ' x ($range - length($value)));
}
print "\n";
}
# Done body
print ''.('-' x $width)."\n";
}
####
### END SYMBOL TABLE FUNCTIONS
####
####
### CODE PARSING FUNCTIONS
####
## Function: code_parse(FILENAME)
#
## Description:
# This is the true heart of the program. All of the lexical analysis is done
# here. This function will make use of the %PARSETABLE along with all of its
# grammar definitions and callback functions. This function will also
# automatically report all recognized tokens unless otherwise specified.
#
sub code_parse($)
{
local $_;
my($file, $found_match) = @_; # file is arg1, found_match will not be set
# Open file, read into one big scalar, and close
open(CODE, $file) or die "% Error: could not open file '$file': $!\n";
$_ = join "", <CODE>;
close CODE;
while ($_) {
$found_match = 0;
while (my($name, $values) = each(%PARSETABLE))
{
s/^[ \t]*//; # Get rid of extraneous spaces.
last if (!$_); # If we removed the last of $_, leave.
my($re, $callback, $silent) =
($values->{'regex'}, $values->{'callback'}, $values->{'silent'});
if (m/^$re/i) {
$found_match = 1;
print "# Found [$name]: $&\n" if (!$silent);
$STATE{'word'} = $&; # update word in STATE
# Call callback function for any data handling.
&$callback($&) if ($callback && defined(&$callback));
}
$_ = $'; #' Goto next token
# Log state changes / update current state
$STATE{'last_word'} = $STATE{'word'};
}
# A character was found that is not in the grammar of the language.
syntax_error("Invalid character in context: '".substr($_, 0, 1)."'")
if (!$found_match && $_);
}
}
## Function: syntax_error(REASON)
#
## Description:
# This function will throw an error and exit the script, returning a failure
# value to the system.
#
sub syntax_error($)
{
#print_symtable(); (DEBUGGING ONLY)
print "% Parse error at line $STATE{'line_number'}: $_[0]\n";
exit(1);
}
###
## CUSTOM PARSING AND CALLBACK FUNCTIONS
###
## Function: parse_identifier(ID_NAME)
#
## Description:
# The main purpose of this function is to add an identifier to the SYMTABLE
# if it does not yet exist, and increase its count value if it does. This
# function will also do basic syntax checking on variable declarations (not
# an assignment requirement).
#
sub parse_identifier($)
{
my($match, $last) = @_; # $last does not get set.
if (lookup_symbol($match)) {
# Found an existing symbol, update its counter
# unless it was redeclared as another type, which it illegal.
$last = $STATE{'last_word'};
if ($last && lookup_symbol($last) &&
$SYMTAB{$last}{'subtype'} eq 'declaration' &&
$SYMTAB{$match}{'subtype'} ne $last &&
$SYMTAB{$match}{'subtype'} ne 'special_entry_function')
{
syntax_error("variable $match (type='$SYMTAB{$match}{'subtype'}') ".
"cannot be redeclared as type $last.");
}
$SYMTAB{$match}{'count'}++; # increase counter
}
else {
# Found a new symbol, check if the prior symbol was a valid declaration
# symbol, and add it if it was.
$last = $STATE{'last_word'};
if (lookup_symbol($last) && $SYMTAB{$last}{'subtype'} eq 'declaration')
{
print "@ Found new symbol: $match (subtype='$last', ".
"scope=".($STATE{'scope'}>0?'local':'global').")\n";
add_symbol($match,
(
'type' => "variable",
'subtype' => $last,
'scope' => ($STATE{'scope'} > 0 ? "local" : "global"),
'count' => 1
)
);
}
else {
# The symbol was not used in a correct context, throw an error:
if (lookup_symbol($last)) {
# 1. The prior symbol was not a declaration / variable type.
syntax_error("Invalid variable declaration for ".
"$last $match, invalid type.");
}
else {
# 2. There was no declaration symbol at all.
syntax_error("Variable $match not previously declared.");
}
}
}
}
## Function: parse_brace('{' OR '}')
#
## Description:
# This function will set the scope of the current code at the location of the
# parser. While it does support more than just a 'global' and 'local' scoping
# system, it is not implemented since the assignment used a global/local
# system.
#
sub parse_brace($)
{
my($match) = @_;
if ($match eq '{') { $STATE{'scope'}++; }
if ($match eq '}') { $STATE{'scope'}--; }
}
## Function: parse_newline($)
#
## Description:
# Increases the line counter when a newline is encountered.
#
sub parse_newline($)
{
$STATE{'line_number'}++; # Go to next line
}
## Function: parse_bcomment(COMMENT)
#
## Description:
# Because a block comment can have newlines in it, we must account for them
# by counting how many exist and adding it to our line_number STATE value.
#
sub parse_bcomment($)
{
# Since \n can exist without \r, but not the other way around, we only need
# to count \n in the string.
$STATE{'line_number'} += ($_[0] =~ tr/\n/\n/);
}
####
### END CODE PARSING FUNCTIONS
#### Powered by
GeSHi Syntax Highlighting software.
Author of all (other) material unless otherwise specified:
Loren Segal. Copyright 2005.