// LuaFunctionRegisterSpreadsheet/GhidraParser/FunctionInfo.cpp
//
// 527 lines
// 15 KiB
// C++
// (web viewer residue: Raw / Normal View / History)

#include "FunctionInfo.hpp"
#include "Utility.hpp"
#include <algorithm>
#include <format>
#include <optional>
#include <regex>
#include <stack>
#include <string>
#include <unordered_map>
// Patterns applied to single source lines (and to the usage string) by the code in this file.
// Matches a local variable declaration at the start of a line; group 1 is the variable name.
std::regex decl_rgx(R"(^ (?:int|uint|BOOL) (\w+);)"); // currently only considering these types for declarations
// Matches an assignment statement; group 1 is the assigned name, group 2 is the right-hand side (without the ';').
std::regex ass_rgx(R"(^ +?(\w+) = (.+);)");
// Matches an if statement; group 1 is the text between the outer parentheses (greedy).
std::regex if_rgx(R"(if \((.+)\) )");
// Matches a quoted "Usage: ..." string literal argument; group 1 is the usage text.
std::regex usg_rgx(R"(,"(Usage: [^;]+).?"[, \)])"); // .? is a workaround to prevent raw string from closing
// Applied to an extracted usage string; group 1 is the parameter list between the first pair of parentheses.
std::regex usg_vld_rgx(R"(Usage: .+?\((.*?)\))");
// Matches a lua_is<type>(L,<index>) call; group 1 is the type suffix, group 2 the index argument.
std::regex lua_is_rgx(R"(lua_is(.+?)\(L,(\w+?)\))");
// Matches a lua_to<type>(L,<index> call followed by ',' or ')'; group 1 is the type suffix, group 2 the index argument.
std::regex lua_to_rgx(R"(lua_to(.+?)\(L,(\w+?)[,\)])");
// Matches a lua_push<type>( call; group 1 is the type suffix. The trailing empty group means group 2 exists but is always empty.
std::regex lua_push_rgx(R"(lua_push(.+?)\(())");
// Matches the closing "while (<cond>);" of a do-while loop; group 1 is the condition.
std::regex dowhl_cond_rgx(R"(while \((.+)\);)");
// Resolves str to an integer: a tracked local variable wins, anything else is handed to prsi.
std::optional<int> prsi_lcls(std::string str, std::unordered_map<std::string, int>& lcls)
{
    const auto known = lcls.find(str);
    if (known == lcls.end()) return prsi(str); // not a tracked local
    return known->second;
}
// Performs one reduction step of the infix evaluation: pops the two topmost operands and the
// topmost operator and applies the operator to them (the deeper operand is the left-hand side).
// The result is returned, NOT pushed - the caller decides what to do with it.
// Returns std::nullopt if
//  - fewer than two operands or no operator are available (malformed expression),
//  - the divisor of a division/modulo is 0,
//  - the operator is not one of the handled binary operators.
__forceinline std::optional<int> slv_step(std::stack<int>& vals, std::stack<Op>& ops)
{
    // top()/pop() on an empty stack is undefined behaviour, so reject malformed input up front
    if (vals.size() < 2 || ops.empty()) return std::nullopt;
    const int v2 = vals.top(); // right-hand operand was pushed last
    vals.pop();
    const int v1 = vals.top();
    vals.pop();
    const Op op = ops.top();
    ops.pop();
    switch (op)
    {
    case LOR:
        return v1 || v2;
    case LAND:
        return v1 && v2;
    case BOR:
        return v1 | v2;
    case BXOR:
        return v1 ^ v2;
    case BAND:
        return v1 & v2;
    case EQ:
        return v1 == v2;
    case UEQ:
        return v1 != v2;
    case LT:
        return v1 < v2;
    case LTE:
        return v1 <= v2;
    case BT:
        return v1 > v2;
    case BTE:
        return v1 >= v2;
    case PLS:
        return v1 + v2;
    case MIN:
        return v1 - v2;
    case MUL:
        return v1 * v2;
    case DIV:
        if (v2 == 0) return std::nullopt; // division by zero is not evaluable
        return v1 / v2;
    case MOD:
        if (v2 == 0) return std::nullopt; // modulo by zero is not evaluable
        return v1 % v2;
    }
    return std::nullopt; // unsupported op
}
__forceinline std::optional<int> push_op(std::stack<int>& vals, std::stack<Op>& ops, Op op)
{
while (ops.size() >= 1 && op_prec(ops.top()) >= op_prec(op))
{ // last op has greater or same precedence
auto res = slv_step(vals, ops);
if (!res.has_value()) return std::nullopt;
vals.push(res.value());
}
ops.push(op);
}
std::regex lua_is_quick(R"(^lua_is\S+?\(L,\S+?\)$)");
std::regex sstrcmpi_quick(R"(^SStrCmpI\(\S+?,\S+?,\S+?\)$)");
std::optional<int> eval(std::string& infix, locals& lcls)
{
std::smatch match;
if (std::regex_search(infix, match, lua_is_quick)) return 1;
if (std::regex_search(infix, match, sstrcmpi_quick)) return 1;
if (infix.find_first_of(' ') == std::string::npos)
{ // TODO performance escape hatch for simple infix strings - might actually be counter productive... have to check
return prsi_lcls(infix, lcls);
}
std::stack<int> vals;
std::stack<Op> ops;
int tk_start = 0;
std::string tk;
bool was_brr;
for (int tk_end = 0; tk_end < infix.length(); tk_end++)
{
was_brr = false;
switch (infix[tk_end])
{
case ' ':
tk = infix.substr(tk_start, tk_end - tk_start);
tk_start = tk_end + 1;
break;
case '(':
tk = "";
ops.push(Op::BRL);
tk_start = tk_end + 1;
continue;
case ')':
tk = infix.substr(tk_start, tk_end - tk_start);
tk_start = tk_end + 1;
was_brr = true;
break;
}
if (tk == "" && tk_end == infix.length() - 1)
{ // last token
tk = infix.substr(tk_start, tk_end - tk_start + 1);
}
if (tk == "" && !was_brr) continue; // empty token, nothing to do
if (tk == "||") push_op(vals, ops, LOR);
else if (tk == "&&") push_op(vals, ops, LAND);
else if (tk == "|") push_op(vals, ops, BOR);
else if (tk == "^") push_op(vals, ops, BXOR);
else if (tk == "&") push_op(vals, ops, BAND);
else if (tk == "==") push_op(vals, ops, EQ);
else if (tk == "!=") push_op(vals, ops, UEQ);
else if (tk == "<") push_op(vals, ops, LT);
else if (tk == "<=") push_op(vals, ops, LTE);
else if (tk == ">") push_op(vals, ops, BT);
else if (tk == ">=") push_op(vals, ops, BTE);
else if (tk == "+") push_op(vals, ops, PLS);
else if (tk == "-") push_op(vals, ops, MIN);
else if (tk == "*") push_op(vals, ops, MUL);
else if (tk == "/") push_op(vals, ops, DIV);
else if (tk == "%") push_op(vals, ops, MOD);
else if (tk != "")
{
auto pr = prsi_lcls(tk, lcls);
if (!pr.has_value()) return std::nullopt;
vals.push(pr.value());
}
if (was_brr)
{
while (ops.top() != BRL)
{
auto res = slv_step(vals, ops);
if (!res.has_value()) return std::nullopt;
vals.push(res.value());
}
ops.pop(); // popping left brace
}
tk = "";
}
while (!ops.empty())
{
auto res = slv_step(vals, ops);
if (!res.has_value()) return std::nullopt;
vals.push(res.value());
}
if (vals.size() != 1 || !ops.empty()) return std::nullopt;
return vals.top();
}
bool FunctionInfo::prc_varmap_rgx(bool prc_in, const std::string& ln_in, std::regex& rgx, locals& lcls)
{
std::string ln = ln_in;
bool fnd = false;
varmap& params = prc_in ? in : out;
while (std::regex_search(ln, match, rgx))
{
fnd = true;
int lua_idx;
std::string lua_type;
lua_type = match[1];
if (lua_type == "lstring") lua_type = "string"; // lstring is a string!
else if (lua_type == "fstring") lua_type = "string"; // frsting is a string!
if (&rgx == &lua_push_rgx)
{ // push parsing uses a global index starting with 1
lua_idx = out.size() + 1;
}
else lua_idx = prsi_lcls(match[2], lcls).value();
if (lua_idx == -1)
{ // -1 means there wasn't a literal used for accessing the index, so i can not parse it
if (&rgx == &lua_push_rgx) out_cnt = -1;
else in_cnt = -1; // TODO might be possible to parse when evaluating variables
return fnd;
}
params[lua_idx].push_back(lua_type); // always push lua type for now
ln = match.suffix();
}
return fnd;
}
// Validates the parsed data and stores the verdict in prs_vld. Findings are appended to prs_msg.
// Checks performed:
//  - any message already present in prs_msg counts as a hard error
//  - if the input count is not dynamic (-1): it must equal the number of parameters listed in the
//    usage string (when there is one), and every index 1..in_cnt must be present in the in varmap
//  - if the output count is not dynamic: it must equal the number of entries in the out varmap
void FunctionInfo::chk_vld()
{ // run all checks, so we have the full picture
    bool vld = prs_msg.empty(); // parsing messages up until this point are hard errors
    if (in_cnt > -1)
    { // in varmap is not dynamic
        if (!usg.empty())
        { // check against usage string
            if (std::regex_search(usg, match, usg_vld_rgx))
            {
                const std::string usg_params = match[1].str();
                // an empty parameter list means 0 parameters, otherwise it is one more than the number of commas
                const int usg_cnt = usg_params.empty()
                    ? 0
                    : static_cast<int>(std::count(usg_params.begin(), usg_params.end(), ',')) + 1;
                if (in_cnt != usg_cnt)
                {
                    prs_msg.push_back(std::format("in param count ({}) does not match usage string ({})", in_cnt, usg_cnt));
                    vld = false;
                }
            }
            else prs_msg.push_back("usage string is malformed"); // do not consider this for validity
        }
        for (int i = 1; i <= in_cnt; i++)
        { // lua indexes start with 1
            if (!in.contains(i))
            {
                prs_msg.push_back("in param index not in order");
                vld = false;
                break;
            }
        }
    }
    if (out_cnt != -1 && out_cnt != static_cast<int>(out.size()))
    {
        prs_msg.push_back(std::format("out param count ({}) does not match return value ({})", out.size(), out_cnt));
        vld = false;
    }
    prs_vld = vld;
}
// Tidies the in (prc_in == true) or out varmap: a dynamic varmap (count -1) is emptied, otherwise
// the type list of every index is sorted and de-duplicated. For the in varmap the count is then
// set to the number of entries.
void FunctionInfo::cln_varmap(bool prc_in)
{
    varmap& params = prc_in ? in : out;
    const bool is_dynamic = (prc_in ? in_cnt : out_cnt) == -1;
    if (is_dynamic)
    { // dynamic varmap does not need entries
        params.clear();
        return;
    }
    for (auto& entry : params)
    {
        auto& types = entry.second;
        std::sort(types.begin(), types.end());
        const auto dup_begin = std::unique(types.begin(), types.end());
        types.erase(dup_begin, types.end());
    }
    // input count can only be inferred by in varmap size
    if (prc_in) in_cnt = in.size();
}
// Returns true if any index of the in (prc_in == true) or out varmap lists the type "nil".
bool FunctionInfo::nil_in_varmap(bool prc_in) const
{
    const varmap& params = prc_in ? in : out;
    return std::any_of(params.begin(), params.end(), [](const auto& entry)
    {
        const auto& types = entry.second;
        return std::find(types.begin(), types.end(), "nil") != types.end();
    });
}
// Renders a multi-line, human readable summary: address (hex) and name with an "(invalid)" marker
// if prs_vld is false, the usage string if present, the in and out varmaps, then one line per
// parse message.
std::string FunctionInfo::str() const
{
    const char* vld_sfx = prs_vld ? "" : " (invalid)";
    std::string res = std::format("{:X} {}{}\n", addr, nm, vld_sfx);
    if (!usg.empty())
    {
        res += " ";
        res += usg;
        res += "\n";
    }
    res += " in: ";
    res += str_varmap(true);
    res += "\n";
    res += " out: ";
    res += str_varmap(false);
    for (const std::string& err : prs_msg)
    {
        res += "\n ";
        res += err;
    }
    return res;
}
// Renders the in (prc_in == true) or out varmap as "<count> (<types>, <types>, ...)", where the
// alternative types of one index are separated by '/'. A count of -1 yields "dynamic".
std::string FunctionInfo::str_varmap(bool prc_in) const
{
    const int cnt = prc_in ? in_cnt : out_cnt;
    if (cnt == -1) return "dynamic";
    if (cnt == 0) return "0 ()";
    const varmap& params = prc_in ? in : out;
    std::string res = std::to_string(cnt) + " (";
    // cnt and params.size() might differ - lua_push* calls can be undetected
    if (params.empty()) return res + ")";
    for (const auto& [lua_index, lua_types] : params)
    {
        for (const auto& lua_type : lua_types)
        {
            res += lua_type;
            res += "/";
        }
        res.pop_back(); // remove last slash
        res += ", ";
    }
    res.resize(res.length() - 2); // cut off the trailing ", "
    res += ')';
    return res;
}
// Default constructor: performs no work of its own.
FunctionInfo::FunctionInfo() = default;
// Builds a FunctionInfo by scanning the source lines of a single function.
//  src - the lines of the function; src[0] and src[1] are read from column 13 on for the address
//        (hex) and the name, the line loop starts at src[3].
// While scanning, the constructor
//  - collects local variable declarations (decl_rgx) and follows their assignments (ass_rgx) via eval,
//  - re-runs do-while bodies as long as their condition evaluates to non-zero (bounded by MAX_ITER),
//  - extracts the usage string (usg_rgx),
//  - records lua_is*/lua_to* calls in the in varmap and lua_push* calls in the out varmap,
//  - derives out_cnt from the evaluated return statements.
// Finally both varmaps are cleaned (cln_varmap) and the result is validated (chk_vld).
// NOTE(review): src is indexed without a size check - presumably the caller guarantees at least two lines; confirm.
// NOTE(review): lnsw/lnew/ssie are defined elsewhere; from their use here they look like "line starts with (at offset)",
// "line ends with" and "set string if empty" - confirm against Utility.hpp.
FunctionInfo::FunctionInfo(std::vector<std::string> src)
{
// parse the address as base 16, starting at column 13 of the first line
addr = std::stoi(&src[0][13], 0, 16); // no use of prsi - values occur once -> no caching wanted
// the name is the text from column 13 of the second line up to the first '('
nm = src[1].substr(13, src[1].find_first_of('(') - 13);
std::string ind; // indentation - keeping track of current block level
std::string lp_utl = ""; // currently in a loop until this line is reached
std::string skp_ass_utl = ""; // skip assignment parsing until a line starting with this text has been reached
std::string skp_push_utl = ""; // skip push parsing until this ln has been reached
std::string cond;
std::optional<int> er; // eval result
bool is_if_ln;
bool prc_decl = true; // process variable declarations
bool enc_usg = false; // encountered usg
uint ret_val; // return value
std::string infix;
lp_track lp_track; // keeps track where the loop started
std::unordered_map<std::string, bool> push_track; // keeps track if on this indent level a push has happened
locals lcls; // local variables
for (int idx = 3; idx < src.size(); idx++) // skip right to the lines which matter
{
std::string& ln = src[idx];
// reset line tracking variables
is_if_ln = false;
if (prc_decl)
{ // if local variable definition parsing is enabled
if (ln == " ")
{ // reached end variable definition block, no further processing required
prc_decl = false;
continue;
}
if (std::regex_search(ln, match, decl_rgx)) lcls[match[1]] = 0; // track with init value 0
continue; // no need to do further processing
}
// find the first non-space character; ind becomes the leading whitespace of this line
// (ind keeps its previous value if the line contains nothing but spaces)
int i = 0;
while (i < ln.length())
{
if (ln[i] != ' ')
{
ind = ln.substr(0, i);
break;
}
i++;
}
// dispatch on the first non-space character to recognise the statement kind cheaply
switch (ln[i])
{
case '}':
if (ln == ind + '}')
{ // block end, maybe simple while or for loop end
if (lp_track.find(ind) != lp_track.end())
lp_track.erase(ind); // reached loop end
break;
}
if (std::regex_search(ln, match, dowhl_cond_rgx))
{ // closing line of a do-while loop: decide whether to run the body again
lp_track[ind].iter--;
if (lp_track[ind].iter < 0)
{ // max iterations exceeded
prs_msg.push_back(std::format("max iterations exceeded in: {}", idx + 1));
}
else
{
infix = match[1];
auto er = eval(infix, lcls); // note: shadows the outer er
if (er.has_value() && er.value() != 0)
{ // condition was parsable and evaluated to true
idx = lp_track[ind].idx; // jump back to the stored loop start; the for loop increments idx afterwards
continue;
}
}
if (lp_track.find(ind) != lp_track.end()) lp_track.erase(ind);
break;
}
break;
case 'c':
case 'd':
if (lnsw(ln, i, "do {"))
{ // setting loop end, if not already in a loop
lp_track[ind] = { idx + (lnew(ln, "{")) - 1, MAX_ITER};
break;
}
if (lnsw(ln, i, "case ") || lnsw(ln, i, "default:"))
{
if (push_track[ind]) ssie(skp_push_utl, ind + "}");
else push_track[ind] = false; // only execute if a previous case didn't already contain a push
break;
}
break;
case 'f':
if (lnsw(ln, i, "for ("))
{ // remember where the for loop starts
lp_track[ind] = { idx + (lnew(ln, "{")) - 1, MAX_ITER};
break;
}
break;
case 'i': // check for if
if (lnsw(ln, i, "if "))
{ // enough characters left to be if statement
is_if_ln = true;
break;
}
break;
case 'e':
if (lnsw(ln, i, "else "))
{ // else branch: skipped if a push was already tracked on this level, otherwise an "else if" is handled like an if
if (push_track[ind])
{
ssie(skp_ass_utl, lnew(ln, "{") ? ind + "}" : "\1");
ssie(skp_push_utl, lnew(ln, "{") ? ind + "}" : "\1");
}
else if (lnsw(ln, i + 5, "if ")) is_if_ln = true;
break;
}
break;
case 'g':
if (!out.empty() && lnsw(ln, i, "goto "))
{ // outputs exist already: skip push parsing until the goto target label ("<label>:") is reached
ssie(skp_push_utl, std::string(ln.substr(i + 5, ln.length() - i - 6)) + ":");
break;
}
break;
case 'r': // check for return
if (out_cnt == -1) break; // this function has a dynamic number of outputs, no need for further processing
if (lnsw(ln, i, "return "))
{ // enough characters left to be the simplest return
// the returned expression is the text after "return " without the final character of the line
cond = std::string(&ln[i + 7], ln.length() - i - 8);
er = eval(cond, lcls);
if (er.has_value()) ret_val = er.value();
else ret_val = -1; // not evaluable (ret_val is unsigned, so this is the maximum value)
// check if already encountered return value (except 0) matches; dynamic if not
if (ret_val != 0)
{ // TODO probably need to check if i am currently skipping push
out_cnt = (out_cnt == 0 || out_cnt == ret_val) ? ret_val : -1;
if (ret_val == out.size())
{ // found a return statement and return value matches output param count
ssie(skp_push_utl, "\1skip2end");
}
}
break;
}
break;
case 'w':
if (lnsw(ln, i, "while ("))
{ // setting loop end, if not already in a loop
lp_track[ind] = { idx + (lnew(ln, "{")) - 1, MAX_ITER};
break;
}
break;
}
if (is_if_ln)
{ // evaluate the if condition; a branch known to be false is skipped for assignment and push parsing
// NOTE(review): the regex_search result is not checked - cond is empty if if_rgx does not match
std::regex_search(ln, match, if_rgx);
cond = match[1];
er = eval(cond, lcls);
if(!er.has_value() || er.value() != 0) push_track[ind] = true;
else
{
push_track[ind] = false;
// block if: skip until the closing brace on this level; single line if: "\1" skips this line only
skp_ass_utl = lnew(ln, "{") ? ind + "}" : "\1";
skp_push_utl = lnew(ln, "{") ? ind + "}" : "\1";
}
}
if (skp_ass_utl.empty() && std::regex_search(ln, match, ass_rgx) && lcls.count(match[1])) // only process assignment for variables i still care about
{ // assignment regex matched and local variable with the name is tracked
infix = match[2];
auto er = eval(infix, lcls); // note: shadows the outer er
if (er.has_value()) lcls[match[1]] = er.value();
else lcls.erase(match[1]); // value is unknown from here on, stop tracking the variable
}
if (!enc_usg)
{ // usage string can only occur once anyway
//if (auto match = ctre::match<usage_regex>(ln))
if (std::regex_search(ln, match, usg_rgx))
{ // found usg string
usg = match[1];
// replace every escaped quote (\") in the usage string with a plain quote
size_t found = -1; // so the first find uses 0 through the increment
while ((found = usg.find("\\\"", found + 1)) != std::string::npos) usg.replace(found, 2, "\"");
enc_usg = true;
ssie(skp_push_utl, "\1"); // sometimes lua_pushfstring is used before lua_error; do not consider this an output!
}
}
if (in_cnt != -1)
{ // inputs are not dynamic (yet): record lua_is*/lua_to* calls of this line
prc_varmap_rgx(true, ln, lua_is_rgx, lcls);
prc_varmap_rgx(true, ln, lua_to_rgx, lcls);
}
if (out_cnt != -1 && skp_push_utl.empty() && prc_varmap_rgx(false, ln, lua_push_rgx, lcls))
{
std::string lvl = is_if_ln ? ind : ind.substr(0, ind.length() - 2); // do not subtract 1 block level, if this was a simple if ln
// NOTE(review): length() - 2 is unsigned arithmetic; for a string shorter than 2 characters it wraps around and
// substr returns the string unchanged - confirm the indent widths make that impossible, otherwise this loop cannot end
while (lvl >= " ")
{ // if a push was found, track it for this and all lower block levels
push_track[lvl] = true;
lvl = lvl.substr(0, lvl.length() - 2);
}
}
// reset the "until" markers: a marker is cleared once a line starting with it has been processed,
// the one-line marker "\1" is cleared at the end of the line it was set on
if (!lp_utl.empty() && ln.starts_with(lp_utl)) lp_utl = "";
if (lp_utl == "\1") lp_utl = "";
if (!skp_ass_utl.empty() && ln.starts_with(skp_ass_utl)) skp_ass_utl = "";
if (skp_ass_utl == "\1") skp_ass_utl = "";
if (!skp_push_utl.empty() && ln.starts_with(skp_push_utl)) skp_push_utl = ""; // reset skip since ln has been reached now
if (skp_push_utl == "\1") skp_push_utl = ""; // reset temporary skip which was used for 1 ln
}
cln_varmap(true); // clean input varmap
cln_varmap(false); // clean output varmap
chk_vld(); // check validity of parsed data
int i = 0; // unused; has no effect (possibly a leftover debugging anchor)
}