-- This Script is Part of the Prometheus Obfuscator by levno-710 -- -- tokenizer.lua -- Overview: -- This Script provides a class for lexical Analysis of lua code. -- This Tokenizer is Capable of tokenizing LuaU and Lua5.1 local Enums = require("prometheus.enums"); local util = require("prometheus.util"); local logger = require("logger"); local config = require("config"); local LuaVersion = Enums.LuaVersion; local lookupify = util.lookupify; local unlookupify = util.unlookupify; local escape = util.escape; local chararray = util.chararray; local keys = util.keys; local Tokenizer = {}; Tokenizer.EOF_CHAR = ""; Tokenizer.WHITESPACE_CHARS = lookupify{ " ", "\t", "\n", "\r", } Tokenizer.ANNOTATION_CHARS = lookupify(chararray("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_")) Tokenizer.ANNOTATION_START_CHARS = lookupify(chararray("!@")) Tokenizer.Conventions = Enums.Conventions; Tokenizer.TokenKind = { Eof = "Eof", Keyword = "Keyword", Symbol = "Symbol", Ident = "Identifier", Number = "Number", String = "String", } Tokenizer.EOF_TOKEN = { kind = Tokenizer.TokenKind.Eof, value = "", startPos = -1, endPos = -1, source = "", } local function token(self, startPos, kind, value) local line, linePos = self:getPosition(self.index); local annotations = self.annotations self.annotations = {}; return { kind = kind, value = value, startPos = startPos, endPos = self.index, source = self.source:sub(startPos + 1, self.index), line = line, linePos = linePos, annotations = annotations, } end local function generateError(self, message) local line, linePos = self:getPosition(self.index); return "Lexing Error at Position " .. tostring(line) .. ":" .. tostring(linePos) .. ", " .. message; end local function generateWarning(token, message) return "Warning at Position " .. tostring(token.line) .. ":" .. tostring(token.linePos) .. ", " .. message; end function Tokenizer:getPosition(i) local column = self.columnMap[i] if not column then --// `i` is bigger than self.length, this shouldn't happen, but it did. (Theres probably some error in the tokenizer, cant find it.) column = self.columnMap[#self.columnMap] end return column.id, column.charMap[i] end --// Prepare columnMap for getPosition function Tokenizer:prepareGetPosition() local columnMap, column = {}, { charMap = {}, id = 1, length = 0 } for index = 1, self.length do local character = string.sub(self.source, index, index) -- NOTE_1: this could use table.clone to reduce amount of NEWTABLE (if that causes any performance issues) local columnLength = column.length + 1 column.length = columnLength column.charMap[index] = columnLength if character == "\n" then column = { charMap = {}, id = column.id + 1, length = 0 } -- NOTE_1 end columnMap[index] = column end self.columnMap = columnMap end -- Constructor for Tokenizer function Tokenizer:new(settings) local luaVersion = (settings and (settings.luaVersion or settings.LuaVersion)) or LuaVersion.LuaU; local conventions = Tokenizer.Conventions[luaVersion]; if(conventions == nil) then logger:error("The Lua Version \"" .. luaVersion .. "\" is not recognized by the Tokenizer! Please use one of the following: \"" .. table.concat(keys(Tokenizer.Conventions), "\",\"") .. "\""); end local tokenizer = { index = 0, -- Index where the current char is read length = 0, source = "", -- Source to Tokenize luaVersion = luaVersion, -- LuaVersion to be used while Tokenizing conventions = conventions; NumberChars = conventions.NumberChars, NumberCharsLookup = lookupify(conventions.NumberChars), Keywords = conventions.Keywords, KeywordsLookup = lookupify(conventions.Keywords), BinaryNumberChars = conventions.BinaryNumberChars, BinaryNumberCharsLookup = lookupify(conventions.BinaryNumberChars); BinaryNums = conventions.BinaryNums, HexadecimalNums = conventions.HexadecimalNums, HexNumberChars = conventions.HexNumberChars, HexNumberCharsLookup = lookupify(conventions.HexNumberChars), DecimalExponent = conventions.DecimalExponent, DecimalSeperators = conventions.DecimalSeperators, IdentChars = conventions.IdentChars, IdentCharsLookup = lookupify(conventions.IdentChars), EscapeSequences = conventions.EscapeSequences, NumericalEscapes = conventions.NumericalEscapes, EscapeZIgnoreNextWhitespace = conventions.EscapeZIgnoreNextWhitespace, HexEscapes = conventions.HexEscapes, UnicodeEscapes = conventions.UnicodeEscapes, SymbolChars = conventions.SymbolChars, SymbolCharsLookup = lookupify(conventions.SymbolChars), MaxSymbolLength = conventions.MaxSymbolLength, Symbols = conventions.Symbols, SymbolsLookup = lookupify(conventions.Symbols), StringStartLookup = lookupify({"\"", "\'"}), annotations = {}, }; setmetatable(tokenizer, self); self.__index = self; return tokenizer; end -- Reset State of Tokenizer to Tokenize another File function Tokenizer:reset() self.index = 0; self.length = 0; self.source = ""; self.annotations = {}; self.columnMap = {}; end -- Append String to this Tokenizer function Tokenizer:append(code) self.source = self.source .. code self.length = self.length + code:len(); self:prepareGetPosition(); end -- Function to peek the n'th char in the source of the tokenizer local function peek(self, n) n = n or 0; local i = self.index + n + 1; if i > self.length then return Tokenizer.EOF_CHAR end return self.source:sub(i, i); end -- Function to get the next char in the source local function get(self) local i = self.index + 1; if i > self.length then logger:error(generateError(self, "Unexpected end of Input")); end self.index = self.index + 1; return self.source:sub(i, i); end -- The same as get except it throws an Error if the char is not contained in charOrLookup local function expect(self, charOrLookup) if(type(charOrLookup) == "string") then charOrLookup = {[charOrLookup] = true}; end local char = peek(self); if charOrLookup[char] ~= true then local etb = unlookupify(charOrLookup); for i, v in ipairs(etb) do etb[i] = escape(v); end local errorMessage = "Unexpected char \"" .. escape(char) .. "\"! Expected one of \"" .. table.concat(etb, "\",\"") .. "\""; logger:error(generateError(self, errorMessage)); end self.index = self.index + 1; return char; end -- Returns wether the n'th char is in the lookup local function is(self, charOrLookup, n) local char = peek(self, n); if(type(charOrLookup) == "string") then return char == charOrLookup; end return charOrLookup[char]; end function Tokenizer:parseAnnotation() if is(self, Tokenizer.ANNOTATION_START_CHARS) then self.index = self.index + 1; local source, length = {}, 0; while(is(self, Tokenizer.ANNOTATION_CHARS)) do source[length + 1] = get(self) length = #source end if length > 0 then self.annotations[string.lower(table.concat(source))] = true; end return nil; end return get(self); end -- skip one or 0 Comments and return wether one was found function Tokenizer:skipComment() if(is(self, "-", 0) and is(self, "-", 1)) then self.index = self.index + 2; if(is(self, "[")) then self.index = self.index + 1; local eqCount = 0; while(is(self, "=")) do self.index = self.index + 1; eqCount = eqCount + 1; end if(is(self, "[")) then -- Multiline Comment -- Get all Chars to Closing bracket but also consider that the count of equal signs must be the same while true do if(self:parseAnnotation() == ']') then local eqCount2 = 0; while(is(self, "=")) do self.index = self.index + 1; eqCount2 = eqCount2 + 1; end if(is(self, "]")) then if(eqCount2 == eqCount) then self.index = self.index + 1; return true end end end end end end -- Single Line Comment -- Get all Chars to next Newline while(self.index < self.length and self:parseAnnotation() ~= "\n") do end return true; end return false; end -- skip All Whitespace and Comments to next Token function Tokenizer:skipWhitespaceAndComments() while self:skipComment() do end while is(self, Tokenizer.WHITESPACE_CHARS) do self.index = self.index + 1; while self:skipComment() do end end end local function int(self, chars, seperators) local buffer = {}; while true do if (is(self, chars)) then buffer[#buffer + 1] = get(self) elseif (is(self, seperators)) then self.index = self.index + 1; else break end end return table.concat(buffer); end -- Lex the next token as a Number function Tokenizer:number() local startPos = self.index; local source = expect(self, setmetatable({["."] = true}, {__index = self.NumberCharsLookup})); if source == "0" then if self.BinaryNums and is(self, lookupify(self.BinaryNums)) then self.index = self.index + 1; source = int(self, self.BinaryNumberCharsLookup, lookupify(self.DecimalSeperators or {})); local value = tonumber(source, 2); return token(self, startPos, Tokenizer.TokenKind.Number, value); end if self.HexadecimalNums and is(self, lookupify(self.HexadecimalNums)) then self.index = self.index + 1; source = int(self, self.HexNumberCharsLookup, lookupify(self.DecimalSeperators or {})); local value = tonumber(source, 16); return token(self, startPos, Tokenizer.TokenKind.Number, value); end end if source == "." then source = source .. int(self, self.NumberCharsLookup, lookupify(self.DecimalSeperators or {})); else source = source .. int(self, self.NumberCharsLookup, lookupify(self.DecimalSeperators or {})); if(is(self, ".")) then source = source .. get(self) .. int(self, self.NumberCharsLookup, lookupify(self.DecimalSeperators or {})); end end if(self.DecimalExponent and is(self, lookupify(self.DecimalExponent))) then source = source .. get(self); if(is(self, lookupify({"+","-"}))) then source = source .. get(self); end local v = int(self, self.NumberCharsLookup, lookupify(self.DecimalSeperators or {})); if(v:len() < 1) then logger:error(generateError(self, "Expected a Valid Exponent!")); end source = source .. v; end local value = tonumber(source); return token(self, startPos, Tokenizer.TokenKind.Number, value); end -- Lex the Next Token as Identifier or Keyword function Tokenizer:ident() local startPos = self.index; local source = expect(self, self.IdentCharsLookup) local sourceAddContent = {source} while(is(self, self.IdentCharsLookup)) do -- source = source .. get(self); table.insert(sourceAddContent, get(self)) end source = table.concat(sourceAddContent) if(self.KeywordsLookup[source]) then return token(self, startPos, Tokenizer.TokenKind.Keyword, source); end local tk = token(self, startPos, Tokenizer.TokenKind.Ident, source); if(string.sub(source, 1, string.len(config.IdentPrefix)) == config.IdentPrefix) then logger:warn(generateWarning(tk, string.format("identifiers should not start with \"%s\" as this may break the program", config.IdentPrefix))); end return tk; end function Tokenizer:singleLineString() local startPos = self.index; local startChar = expect(self, self.StringStartLookup); local buffer = {}; while (not is(self, startChar)) do local char = get(self); -- Single Line String may not contain Linebreaks except when they are escaped by \ if(char == '\n') then self.index = self.index - 1; logger:error(generateError(self, "Unterminated String")); end if(char == "\\") then char = get(self); local escape = self.EscapeSequences[char]; if(type(escape) == "string") then char = escape; elseif(self.NumericalEscapes and self.NumberCharsLookup[char]) then local numstr = char; if(is(self, self.NumberCharsLookup)) then char = get(self); numstr = numstr .. char; end if(is(self, self.NumberCharsLookup)) then char = get(self); numstr = numstr .. char; end char = string.char(tonumber(numstr)); elseif(self.UnicodeEscapes and char == "u") then expect(self, "{"); local num = ""; while (is(self, self.HexNumberCharsLookup)) do num = num .. get(self); end expect(self, "}"); char = util.utf8char(tonumber(num, 16)); elseif(self.HexEscapes and char == "x") then local hex = expect(self, self.HexNumberCharsLookup) .. expect(self, self.HexNumberCharsLookup); char = string.char(tonumber(hex, 16)); elseif(self.EscapeZIgnoreNextWhitespace and char == "z") then char = ""; while(is(self, Tokenizer.WHITESPACE_CHARS)) do self.index = self.index + 1; end end end --// since table.insert is slower in lua51 buffer[#buffer + 1] = char end expect(self, startChar); return token(self, startPos, Tokenizer.TokenKind.String, table.concat(buffer)) end function Tokenizer:multiLineString() local startPos = self.index; if(is(self, "[")) then self.index = self.index + 1; local eqCount = 0; while(is(self, "=")) do self.index = self.index + 1; eqCount = eqCount + 1; end if(is(self, "[")) then -- Multiline String -- Parse String to Closing bracket but also consider that the count of equal signs must be the same -- Skip Leading newline if existing self.index = self.index + 1; if(is(self, "\n")) then self.index = self.index + 1; end local value = ""; while true do local char = get(self); if(char == ']') then local eqCount2 = 0; while(is(self, "=")) do char = char .. get(self); eqCount2 = eqCount2 + 1; end if(is(self, "]")) then if(eqCount2 == eqCount) then self.index = self.index + 1; return token(self, startPos, Tokenizer.TokenKind.String, value), true end end end value = value .. char; end end end self.index = startPos; return nil, false -- There was not an actual multiline string at the given Position end function Tokenizer:symbol() local startPos = self.index; for len = self.MaxSymbolLength, 1, -1 do local str = self.source:sub(self.index + 1, self.index + len); if self.SymbolsLookup[str] then self.index = self.index + len; return token(self, startPos, Tokenizer.TokenKind.Symbol, str); end end logger:error(generateError(self, "Unknown Symbol")); end -- get the Next token function Tokenizer:next() -- Skip All Whitespace before the token self:skipWhitespaceAndComments(); local startPos = self.index; if startPos >= self.length then return token(self, startPos, Tokenizer.TokenKind.Eof); end -- Numbers if(is(self, self.NumberCharsLookup)) then return self:number(); end -- Identifiers and Keywords if(is(self, self.IdentCharsLookup)) then return self:ident(); end -- Singleline String Literals if(is(self, self.StringStartLookup)) then return self:singleLineString(); end -- Multiline String Literals if(is(self, "[", 0)) then -- The isString variable is due to the fact that "[" could also be a symbol for indexing local value, isString = self:multiLineString(); if isString then return value; end end -- Number starting with dot if(is(self, ".") and is(self, self.NumberCharsLookup, 1)) then return self:number(); end -- Symbols if(is(self, self.SymbolCharsLookup)) then return self:symbol(); end logger:error(generateError(self, "Unexpected char \"" .. escape(peek(self)) .. "\"!")); end function Tokenizer:scanAll() local tb = {}; repeat local token = self:next(); table.insert(tb, token); until token.kind == Tokenizer.TokenKind.Eof return tb end return Tokenizer