
# a tokenizer for the IttyBitty language
#

# Tokenizes IttyBitty source code.
#
# source: the program text; a String holding one or more lines (any object
#         that responds to each_line and yields Strings works the same way).
#
# Returns an Array of [ type, value ] pairs in source order, where value is
# the token text exactly as written and type is one of the strings:
#     quote, text, keyword, compop, assignop, bracket, delimiter,
#     integer, mathop, variable, unknown
#
# Behaviour worth knowing:
#   - Tokens are whitespace-delimited words, so "x=1" is one (unknown) token
#     while "x = 1" is three tokens.
#   - Everything from the first '#' on a line to the end of that line is
#     discarded as a comment before the line is split into words.  This
#     happens even when the '#' sits between two quote tokens.
#   - A lone " token toggles text mode; every word between two quote tokens
#     is typed 'text' (keywords and operators included).  Text mode is not
#     reset at line ends, so an unclosed quote carries over to later lines.
#   - A word that matches no token type is reported on standard output and
#     returned with type 'unknown'; tokenizing continues with the next word.
def tokenize(source)
   # Pass 1: collect the whitespace-separated words of every line,
   #         with comments removed.
   words = []
   source.each_line do |line|
      comment_start = line.index('#')
      # The exclusive range yields "" when the '#' is the first character.
      code = comment_start ? line[0...comment_start] : line
      # concat appends in place; += would copy the whole list for each line.
      words.concat(code.split)
   end

   # Pass 2: pair each word with its token type.  in_text remembers whether
   #         the current word lies between an opening and a closing quote.
   in_text = false
   words.map do |word|
      type =
         if word == '"'
            in_text = !in_text   # entering or leaving a text literal
            'quote'
         elsif in_text
            'text'               # everything inside " " is text
         else
            # Order matters: the integer pattern is tested before the math
            # operators, although a bare "-" has no digits and so still
            # falls through to mathop.
            case word
            when 'if', 'while', 'print' then 'keyword'
            when '==', '<', '!=', '>', '>=', '<=' then 'compop'
            when '=' then 'assignop'
            when '(', ')' then 'bracket'
            when '{', '}' then 'delimiter'
            when /\A-?[0-9]+\z/ then 'integer'
            when '-', '+', '*', '/', '%' then 'mathop'
            when /\A[a-zA-Z]+\z/ then 'variable'
            else
               puts "Error: invalid token encountered: #{word}"
               'unknown'
            end
         end

      # the token value is simply the text for that token
      [type, word]
   end
end

