Viewing file: soen229/lexsym.pl | Back to directory listing
Author: Loren Segal | Last modified: February 20 2006 07:00 pm | Download

#!/usr/bin/perl -w
#
# LEXSYM - Lexical analysis and symbol table generation script
# 
# @ Author: Loren Segal 
# @ Date: February 2005
#
# Usage: ./lexsym [source_file.t]
# (If a file is not given in command line, it will be prompted via the script)
#
# Description:
# This script will analyze a T source file and generate a symbol table for
# all of the identifiers which were matched. It also does basic code parsing.
# It should be stressed however, how limited the code parsing actually is, and
# because of this, not all generated error messages are nessarily intuitive to
# the user. The main goal of this script is to generate a proper symbol table,
# NOT to properly parse code or give descriptive error messages. With that said,
# expect to be able to 'break' the error generation system fairly easily (eg.
# unclosed strings, unclosed block comments, invalid array assignments, etc.)
#
# Note for SOEN229: The requirements state that identifier,arithmetic,bracket
#                   number,etc. must be recognized by the parser. All of these
#                   requirements have been met. In addition, however, strings
#                   and line/block comments are also recognized with a C-style
#                   syntax. I did this for personal knowledge, and while you
#                   are free to test them, please do not take off marks for
#                   the simple fact that they are defined, thanks :-).
#
use strict;         # be strict about programming style
 
####
### BEGIN PROTOTYPES
####
 
# symbol table functions
sub init_symtable($);
sub load_symtable();
sub add_symbol($@);
sub lookup_symbol($);
sub print_symtable();
 
# code parsing functions
sub code_parse($);
sub syntax_error($);
# callback parse functions
sub parse_identifier($);
sub parse_brace($);
sub parse_newline($);
sub parse_bcomment($);
 
# main/misc prototypes
sub main();
 
####
### END PROTOTYPES
####
 
# GLOBAL VARIABLES
my(%SYMTAB);       # Global Symbol Table
 
# PARSE TABLE - Contains all lexical analysis definitions and callback subs
#
# The code_parse() function will use this table to test all of the regex's to
# match any token in the grammar. If none of the following definitions are
# matched, an 'invalid character' error will be thrown. Upon a match, a
# callback subroutine will be called if there is one defined. There is also a
# special key 'silent' which will hide default output of the recognized token,
# this is specifically used for newline matches.
#
# Keys:
# 'regex'    => contains regex information for the grammar definition
# 'callback' => a possible subroutine to handle the definition in more depth-
#               this is usually for error checking, rather than simple symbol
#               table work.
# 'silent'   => a special key to hide the default output by the token
#               recognition loop.
#
my(%PARSETABLE) = (
  'Identifier' => {
    'regex'    => qr/[a-z_]\w*/,
    'callback' => \&parse_identifier
  },
  'Unsigned Value' => {
    'regex'    => qr/\d+(?:\.\d+)?/,
    'callback' => undef
  },
  'String Value' => {
    'regex'    => qr/(['"]).*?[^\\]\1|(['"])\2/, #'"
    'callback' => undef
  },
  'Increment'  => {
    'regex'    => qr/[+-]{2}(?![+-])/,
    'callback' => undef
  },
  'Arithmetic' => {
    'regex'    => qr/[+-](?![+-])/,
    'callback' => undef
  },
  'Relational' => {
    'regex'    => qr/[<>]=?/,
    'callback' => undef
  },
  'Assignment' => {
    'regex'    => qr/=/,
    'callback' => undef
  },
  'Equality'   => {
    'regex'    => qr/(?:!=|==)/,
    'callback' => undef
  },
  'Brackets' => {
    'regex'    => qr/[()]/,
    'callback' => undef
  },
  'Braces' => {
    'regex'    => qr/[{}]/,
    'callback' => \&parse_brace
  },
  'End_Statement' => {
    'regex'    => qr/;/,
    'callback' => undef
  },
  'Next Argument' => {
    'regex'    => qr/,/,
    'callback' => undef
  },
  'Next Line' => {                   # Adding a nextline token match instead
    'regex'    => qr/[\n\r]+/,       # of going line by line allows for multi
    'callback' => \&parse_newline,   # line comments and also takes out one
    'silent'   => 1                  # loop-nest of the code_parse loop.
  },
  'Line Comment' => {                # Thankfully, '.' does not match newlines
    'regex'    => qr|//.*|,          # unless the //s switch is applied.
    'callback' => undef
  },
  'Block Comment' => {
    'regex'    => qr|/\*.*\*/|s,     # Here we take advantages of m//s for
    'callback' => \&parse_bcomment   # comments that can span multiple lines.
  }
);
 
# STATE table - Holds all information during a code_parse
# note: originally, %STATE was a local variable to code_parse.
#       However, as its size grew, it seemed unorganized to define
#       such a large (and important) hash locally within a sub.
#
# Keys:
# 'word'        => The current identifier token, all other tokens are ignored
# 'last_word'   => The last identifier
# 'scope'       => The scope of the program. Every opening brace increases this
#                  count by one, and vice versa for a close brace. Any scope > 0
#                  is considered local, and 0 is considered global.
# 'line_number' => The current line number of the lexical parser
#
my(%STATE) = (
  'word' => undef,
  'last_word' => undef,
  'scope' => 0,
  'line_number' => 1
);
 
####
### BEGIN PROGRAM LISTING
####
main();          # Call main()
####
### END PROGRAM
####
 
sub main()
{
  my($file);
  init_symtable("symbolfile.txt");  # initiate symbol table, hardcoded location
  load_symtable();                  # load keyval pairs into table
  print_symtable();                 # print symbol table before parsing
  # Get the filename
  if ($ARGV[0])
  {
    $file = $ARGV[0];               # take the source file from command line
  }                                 # if it was specified.
  else
  { 
    print "Enter source file: ";    # if no source file is entered via command
    chomp($file = <STDIN>);         # line, it will prompt the user for one.
  }
  code_parse($file);                # parse the file
  print_symtable();                 # print the generated symbol table
}
 
####
### SYMBOL TABLE FUNCTIONS
####
 
## Function: init_symbtable(FILENAME)
#
## Description: Opens the symbol table for reading under the filehandle SYMTAB
#
sub init_symtable($)
{
  open(SYMTAB, $_[0]) or die("% Error: could not open file '$_[0]': $!\n");
}
 
## Function: load_symtable()
#
## Description: Adds all of the key/value pairs to their 
#               appropriate symbol name in the SYMTAB.
#
sub load_symtable()
{
  local $_;
  my($line, $valid) = (0, 1);
  for (<SYMTAB>)
  {
    chomp;
    next if (!$_ || m/^\s*#.*/);          # ignore commented and empty lines
                                          # -- NOT required by assignment.
    $line++;
    my($name) = m/^\s*(\w+):\s*\w+=\w+/;  # matches-> NAME: *=* ...
    my(@matches) = m/(\w*)=(\w*)/g;       # matches-> *=*
    $valid = 1;
    for(my($i) = 0; $i <= $#matches; $i++)
    {
      if ($matches[$i] eq '') { $valid = 0; last; }
    }
    if (!$name || !$valid) 
    {
      # Give an error if a line of the symbol table file is corrupted or
      # improperly defined.
      die("% Error: symbol table has invalid definition at line $line.\n");
    }
    # Add the symbol- it is valid.
    add_symbol($name, @matches);
  }
  close(SYMTAB);
}
 
## Function: add_symbol(NAME, HASH_VALUES)
#
## Description:
# Adds a symbol NAME to the SYMTABLE along with its key/value pairs defined by
# HASH_VALUES
#
sub add_symbol($@)
{
  my($name, %symbols) = @_;
  return if (!values(%symbols));
  $symbols{'count'} = 0;     # Force count to be initialized at 0,
                             # even if it is not declared or set as something
                             # else in the symbol file, which is not legal.
  $SYMTAB{$name} = \%symbols;
}
 
## Function: lookup_symbol(NAME)
#
## Description:
# Looks up a symbol in SYMTAB and returns true or false (undef) if it exists
# or not, respectively.
#
sub lookup_symbol($) 
{
  my($name) = @_;
  return (defined($SYMTAB{$name}) ? !undef : undef);
}
 
## Function: print_symtable()
#
## Description:
# Prints the SYMTAB in a human readable format to the screen in a table.
# This function will make two passes, scanning the SYMBOL TABLE once to
# find all of the keys that need to be printed, and then to actually print
# all of the keys with the values for each symbol.
#
sub print_symtable()
{
  my $range = 16;              # Put a max of 16 spaces between each key
  my $subrange = $range - 1;   # Needed to trimming
  my($width, $key, $name, $value, $msg, %list);
  # Do first pass, find all the keys that were set by symbol definitions.
  # This is to allow other definitions besides subtype,type,count,scope
  while (($name, $value) = (each %SYMTAB))
  {
    for $key (keys %$value) { $list{$key} = $key; }
  }
  $width = (keys(%list) + 1) * $range; # Get width of the table
  
  # Print the header with all the keys generated by the first pass
  print ''.('-' x $width) . "\n";
  print "Symbol Table\nname". (' ' x ($range - length("name")));
  for $key (sort keys %list)
  {
    print $key . (' ' x ($range - length($key)));
  }
  print "\n".('-' x $width)."\n\n";
  # Done header
  
  # Print the body, if a key is not defined for a certain symbol, print ---
  for $name (sort keys %SYMTAB)
  {
    print $name . (' ' x ($range - length($name)));
    for $key (sort keys %list)
    {
      $value = $SYMTAB{$name}{$key};
      if (!defined($SYMTAB{$name}{$key})) { $value = '---'; }
      $value =~ s/(.{1,$subrange}).*/$1/; # Chop off the end if it is too long
                                          # for the column.
      print $value . (' ' x ($range - length($value)));
    }
    print "\n";
  }
  # Done body
  print ''.('-' x $width)."\n";
}
####
### END SYMBOL TABLE FUNCTIONS
####
 
####
### CODE PARSING FUNCTIONS
####
 
## Function: code_parse(FILENAME)
#
## Description:
# This is the true heart of the program. All of the lexical analysis is done
# here. This function will make use of the %PARSETABLE along with all of its
# grammar definitions and callback functions. This function will also
# automatically report all recognized tokens unless otherwise specified.
#
sub code_parse($)
{
  local $_;
  my($file, $found_match) = @_;   # file is arg1, found_match will not be set
 
  # Open file, read into one big scalar, and close
  open(CODE, $file) or die "% Error: could not open file '$file': $!\n";
  $_ = join "", <CODE>;
  close CODE;
 
  while ($_) {
    $found_match = 0;
    while (my($name, $values) = each(%PARSETABLE))
    {
      s/^[ \t]*//;               # Get rid of extraneous spaces.
      last if (!$_);             # If we removed the last of $_, leave.
      my($re, $callback, $silent) =
        ($values->{'regex'}, $values->{'callback'}, $values->{'silent'});
      if (m/^$re/i) {
        $found_match = 1;
        print "# Found [$name]: $&\n" if (!$silent);
        $STATE{'word'} = $&;     # update word in STATE
        # Call callback function for any data handling.
        &$callback($&) if ($callback && defined(&$callback));
      }
      $_ = $';                   #' Goto next token
 
      # Log state changes / update current state
      $STATE{'last_word'} = $STATE{'word'};
    }
    # A character was found that is not in the grammar of the language.
    syntax_error("Invalid character in context: '".substr($_, 0, 1)."'")
                                                  if (!$found_match && $_);
  }
}
 
## Function: syntax_error(REASON)
#
## Description:
# This function will throw an error and exit the script, returning a failure
# value to the system.
#
sub syntax_error($)
{
  #print_symtable();  (DEBUGGING ONLY)
  print "% Parse error at line $STATE{'line_number'}: $_[0]\n";
  exit(1);
}
 
###
## CUSTOM PARSING AND CALLBACK FUNCTIONS
###
 
## Function: parse_identifier(ID_NAME)
#
## Description:
# The main purpose of this function is to add an identifier to the SYMTABLE
# if it does not yet exist, and increase its count value if it does. This
# function will also do basic syntax checking on variable declarations (not
# an assignment requirement).
#
sub parse_identifier($)
{
  my($match, $last) = @_;    # $last does not get set.
  if (lookup_symbol($match)) {
    # Found an existing symbol, update its counter
    # unless it was redeclared as another type, which it illegal.
    $last = $STATE{'last_word'};
    if ($last && lookup_symbol($last) &&
          $SYMTAB{$last}{'subtype'} eq 'declaration' &&
        $SYMTAB{$match}{'subtype'} ne $last &&
        $SYMTAB{$match}{'subtype'} ne 'special_entry_function')
    {
      syntax_error("variable $match (type='$SYMTAB{$match}{'subtype'}') ".
           "cannot be redeclared as type $last.");
    } 
    $SYMTAB{$match}{'count'}++;  # increase counter
  }
  else {
    # Found a new symbol, check if the prior symbol was a valid declaration
    # symbol, and add it if it was.
    $last = $STATE{'last_word'};
    if (lookup_symbol($last) && $SYMTAB{$last}{'subtype'} eq 'declaration')
    {
      print "@ Found new symbol: $match (subtype='$last', ".
            "scope=".($STATE{'scope'}>0?'local':'global').")\n";
      add_symbol($match,
                    (
                      'type' => "variable",
                      'subtype' => $last,
                      'scope' => ($STATE{'scope'} > 0 ? "local" : "global"),
                      'count' => 1
                    )
      );
    }
    else {
      # The symbol was not used in a correct context, throw an error:
      if (lookup_symbol($last)) {
        # 1. The prior symbol was not a declaration / variable type.
        syntax_error("Invalid variable declaration for ".
                     "$last $match, invalid type.");
      }
      else {
        # 2. There was no declaration symbol at all.
        syntax_error("Variable $match not previously declared.");
      }
 
    }
  }
}
 
## Function: parse_brace('{' OR '}')
#
## Description:
# This function will set the scope of the current code at the location of the
# parser. While it does support more than just a 'global' and 'local' scoping
# system, it is not implemented since the assignment used a global/local
# system.
#
sub parse_brace($)
{
  my($match) = @_;
  if ($match eq '{') { $STATE{'scope'}++; }
  if ($match eq '}') { $STATE{'scope'}--; }
}
 
## Function: parse_newline($)
#
## Description:
# Increases the line counter when a newline is encountered.
#
sub parse_newline($)
{
  $STATE{'line_number'}++;  # Go to next line
}
 
## Function: parse_bcomment(COMMENT)
#
## Description:
# Because a block comment can have newlines in it, we must account for them
# by counting how many exist and adding it to our line_number STATE value.
#
sub parse_bcomment($)
{
  # Since \n can exist without \r, but not the other way around, we only need
  # to count \n in the string.
  $STATE{'line_number'} += ($_[0] =~ tr/\n/\n/);
}
 
####
### END CODE PARSING FUNCTIONS
####