// LuaFunctionRegisterSpreadsheet/GhidraParser/GhidraParser.cpp
//
// 356 lines
// 11 KiB
// C++

#include "FunctionInfo.hpp"
#include "Utility.hpp"
#include <thread>
#include <chrono>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <queue>
#include <regex>
#include <string>
#include <string_view>
#include <unordered_map>
#include <map>
// Path of the text file to parse; assigned in main() before the file is opened.
std::string fl_path;
// Debug flag; when set, main() prints the parsed function map through pfmap() once parsing is done.
bool dbg;
// Handed to pfmap(): when set, only functions whose parse result is not valid are printed.
bool ivld_only;
// Chooses in main() between the full input file (true) and the ".test" input file (false).
bool use_all = true;
// NOTE(review): the two comment blocks below look like clean ("cln") / invalid ("ivld")
// counts noted down from earlier runs (compare the output of pstats()) — confirm or remove.
/*
* 907 cln
* 25 ivld
*/
/*
* 1742 cln
* 123 ivld
*/
// Print `msg` followed by a newline to std::cout.
// `msg` is deliberately left unparenthesised so existing call sites keep expanding as before.
#define PL(msg) std::cout << msg << '\n'
// Print `msg` (via PL) only while the debug flag `dbg` is set.
// The do/while(0) wrapper makes the macro a single statement, so it can be used as the
// body of an if/else without capturing the caller's `else`.
#define PDBG(msg) do { if (dbg) PL(msg); } while (0)
// Shared match result that the regex searches in this file write into.
std::smatch match1; // TODO remove later
// Captures the "Usage: ..." text (group 1) of a quoted string literal that follows a comma.
std::regex usage_regex(R"(,"(Usage: [^;]+).?"[, \)])"); // .? is just a workaround, since )" in the raw string would terminate it immediately
// Group 1: the text between "lua_push" and the opening parenthesis; group 2 is an empty group.
std::regex lua_push_regex(R"(lua_push(.+?)\(())");
// Group 1: the text between "lua_is" and "(L,"; group 2: the word-character argument after "L,".
std::regex lua_is_regex(R"(lua_is(.+?)\(L,(\w+?)\))");
// Group 1: the text between "lua_to" and "(L,"; group 2: the word-character argument after "L,".
std::regex lua_to_regex(R"(lua_to(.+?)\(L,(\w+?)[,\)])");
// Group 1: the name of a declared local variable.
std::regex var_decl_regex(R"(^ (?:int|uint|BOOL) (\w+);)"); // currently only considering these types for declarations
// Matches assignment statements of the form `name = word [op] [word];`.
// No parser logic consumes its captures yet.
std::regex var_ass_regex(R"(^ +?(\w+?) = (\w+) ?(.)? ?(\w+?){0,1};)");
// Record of the function currently being parsed; main() resets it per function and stores it in fmap.
FunctionInfo fi;
// Scratch value written by process_varmap_regex(): index of the most recent match.
int lua_index;
// Scratch value written by process_varmap_regex(): type name of the most recent match.
std::string lua_type;
// Running index handed to lua_push* matches; main() resets it to 0 for every function.
int out_index;
// All parsed functions, keyed by FunctionInfo::nm.
std::unordered_map<std::string, FunctionInfo> fmap;
// Timestamps taken in main() before and after parsing; pstats() prints their difference.
std::chrono::steady_clock::time_point tstart;
std::chrono::steady_clock::time_point tend;
/**
 * Collects every match of `regex` in `ln` and records the matched lua type in `varmap`.
 *
 * For each match the type (capture 1) is normalised ("lstring" and "fstring" both become
 * "string") and appended to `varmap[index]`. The index is taken from the running global
 * `out_index` counter for lua_push* matches, and parsed from capture 2 for everything else.
 *
 * If the index is -1 (per the original author: the call did not use a literal index, so it
 * cannot be parsed) the matching count on the global `fi` is set to -1 and scanning stops.
 *
 * Side effects: overwrites the globals `match1`, `lua_type` and `lua_index`; may advance
 * `out_index` and modify `fi.in_cnt` / `fi.out_cnt`.
 *
 * @param varmap index -> list of type names to append to
 * @param ln     line to scan (taken by value; consumed while scanning)
 * @param regex  one of the lua_* regex objects; compared by address against lua_push_regex
 * @return true if the regex matched at least once
 */
bool process_varmap_regex(std::map<int, std::vector<std::string>>& varmap, std::string ln, std::regex* regex)
{
    const bool is_push = (regex == &lua_push_regex);
    bool matched_any = false;

    // Each round continues the search in whatever follows the previous match.
    for (; std::regex_search(ln, match1, *regex); ln = match1.suffix())
    {
        matched_any = true;

        lua_type = match1[1];
        if (lua_type == "lstring" || lua_type == "fstring")
        {   // both variants are plain strings
            lua_type = "string";
        }

        if (is_push)
        {   // pushes carry no index argument; their position comes from the global counter
            lua_index = out_index;
            out_index++;
        }
        else
        {
            lua_index = prsi(match1[2]).val;
        }

        if (lua_index == -1)
        {   // no usable index: mark the affected side as unparsable and give up on this line
            if (is_push)
            {
                fi.out_cnt = -1;
            }
            else
            {
                fi.in_cnt = -1; // TODO might be possible to parse when evaluating variables
            }
            break;
        }

        varmap[lua_index].push_back(lua_type); // always record the lua type for now
    }
    return matched_any;
}
/**
 * Prints the string form of the functions stored in `fmap`, one per PL() call,
 * each followed by an extra blank line.
 *
 * @param ivld_only when true, functions whose `prs_vld` flag is set are skipped, so only
 *                  the ones that did not parse validly are printed
 */
void pfmap(bool ivld_only)
{
    for (const auto& entry : fmap)
    {
        const auto& info = entry.second;
        if (ivld_only && info.prs_vld)
        {
            continue; // caller only wants the invalid ones
        }
        PL(info.str() + "\n");
    }
}
/**
 * Prints parsing statistics for everything stored in `fmap`:
 * the number of functions, how many of them are "clean", how many are invalid,
 * and the time between `tstart` and `tend` in milliseconds.
 *
 * A function counts as clean when it parsed validly, neither its input nor its output
 * count is -1, and `nil_in_varmap(false)` reports false. It counts as invalid when its
 * `prs_vld` flag is not set.
 */
void pstats()
{
    int total = 0;
    int clean = 0;
    int invalid = 0;

    for (const auto& entry : fmap)
    {
        const auto& info = entry.second;
        ++total;

        const bool is_clean = info.prs_vld
            && info.in_cnt != -1
            && info.out_cnt != -1
            && !info.nil_in_varmap(false);
        if (is_clean)
        {
            ++clean;
        }
        if (!info.prs_vld)
        {
            ++invalid;
        }
    }

    const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(tend - tstart);

    PL(std::format("Functions parsed: {}", total));
    PL(std::format("Clean: {}", clean));
    PL(std::format("Invalid: {}", invalid));
    PL(std::format("Duration: {} ms", elapsed.count()));
}
int main()
{
/*
* PARSING RULES
* lua_is* calls can occur in if-lines
* lua_to* calls can occur in if-lines
* lua_push* calls can NOT occur in if-lines
* lua_push* calls are not called with an index as param, unlike lua_is* / lua_to* - the index is inferred by the order of calls
* lua_push* calls can occur in branch structures (if/else, switch) - they must only be considered in one of them
* wow functions return the number of outputs - 0 when they error or there are none; consider any non-0 return as the real number of outputs
* wow functions may return a dynamic number of outputs - there is no way to parse this into a single result
* skip scanning for the usg string if it already has been encountered in the function, there can only be 1
* skip scanning for input params, if unparsable index (not into int) has been found for lua_is* or lua_to* (dynamic inputs)
* skip scanning for output params, if unparsable index (not into int) has been found for lua_push* (dynamic outputs)
* skip scanning for output params, if returns with different non-0 values have been found (or it can not be parsed as int)
*/
tstart = std::chrono::high_resolution_clock::now();
if (use_all)
{
dbg = true;
ivld_only = true;
fl_path = R"(C:\Users\alphaomega\Documents\Wow.exe.c.txt)";
}
else
{
dbg = true;
ivld_only = false;
fl_path = R"(C:\Users\alphaomega\Documents\Wow.exe.c.test.txt)";
}
std::ifstream source_file(fl_path);
if (!source_file.is_open())
{
std::cerr << "Unable to open file " << fl_path << std::endl;
return 1;
}
std::unordered_map<std::string, int> func_vars;
std::string skp_push_utl = ""; // skip push parsing until this ln has been reached
std::string lp_utl = "";
std::unordered_map<std::string, bool> push_track;
bool in_func = false;
bool enc_usg = false; // encountered usg
bool prc_def = false;
uint ret_val; // return value
bool is_if_ln;
std::string ind; // indent
std::string ln;
std::vector<char>::iterator eol;
while (std::getline(source_file, ln))
{
if (ln.empty()) continue; // skip empty lines
if (ln.starts_with("/*")) continue; // skip block comment lines
if (!in_func)
{ // searching for next function
if (ln.starts_with("// ADDRESS - "))
{ // found addr ln - create new FunctionInfo and reset variables
fi = {};
fi.addr = std::stoi(&ln[13], 0, 16); // do not use prsi, these values prb occur only once, so no caching wanted
out_index = 0;
enc_usg = false;
lp_utl = "";
skp_push_utl = "";
push_track = {};
func_vars = {};
continue;
}
if (ln.starts_with("uint lua_wow_"))
{ // found ln with function signature
fi.nm = ln.substr(13, ln.find_first_of("(") - 13); // get part of real function name
in_func = true; // start function parsing from now on
continue;
}
}
else
{ // processing current function
is_if_ln = false;
if (ln == "{")
{ // this is the starting block of the function
prc_def = true; // enable regex handling for local variable definitions
continue;
}
if (ln == "}")
{ // end of function found
in_func = false;
fi.cln_varmap(true);
fi.cln_varmap(false);
fi.chk_vld();
fmap[fi.nm] = fi;
continue;
}
if (prc_def)
{ // if local variable definition parsing is enabled
if (ln == " ")
{ // reached end variable definition block, no further processing required
prc_def = false;
continue;
}
if (std::regex_search(ln, match1, var_decl_regex))
{ // found local variable, track it with init value 0
func_vars[match1[1]] = 0;
}
}
int i = 0;
while (i < ln.length())
{
if (ln[i] != ' ')
{
ind = ln.substr(0, i);
break;
}
i++;
}
switch (ln[i])
{
case 'c':
case 'd':
if (lnsw(ln, i, "do {"))
{ // setting loop end, if not already in a loop
ssie(lp_utl, ln[ln.length() - 1] == '{' ? ind + "}" : "\1"); // check if this is a single ln do-while loop (who would even program such a thing!?
break;
}
if (lnsw(ln, i, "case ") || lnsw(ln, i, "default:"))
{
if (push_track[ind]) ssie(skp_push_utl, ind + "}");
else push_track[ind] = false; // only execute if a previous case didn't already contain a push
break;
}
break;
case 'f':
if (lnsw(ln, i, "for ("))
{
ssie(lp_utl, ln[ln.length() - 1] == '{' ? ind + "}" : "\1"); // check if this is a single ln for loop
break;
}
break;
case 'i': // check for if
if (lnsw(ln, i, "if "))
{ // enough characters left to be if statement
is_if_ln = true;
push_track[ind] = false;
break;
}
break;
case 'e':
if (lnsw(ln, i, "else "))
{ // enough characters left to be if statement
if (push_track[ind]) ssie(skp_push_utl, ln[ln.length() - 1] == '{' ? ind + "}" : "\1");
break;
}
break;
case 'g':
if (!fi.out.empty() && lnsw(ln, i, "goto "))
{
ssie(skp_push_utl, std::string(ln.substr(i + 5, ln.length() - i - 6)) + ":");
break;
}
break;
case 'r': // check for return
if (fi.out_cnt == -1) break; // this function has a dynamic number of outputs, no need for further processing
if (lnsw(ln, i, "return "))
{ // enough characters left to be the simplest return
ret_val = prsi(std::string(&ln[i + 7], ln.length() - i - 8)).val;
// check if already encountered return value (except 0) matches; dynamic if not
if (ret_val != 0)
{ // TODO probably need to check if i am currently skipping push
fi.out_cnt = (fi.out_cnt == 0 || fi.out_cnt == ret_val) ? ret_val : -1;
if (ret_val == fi.out.size())
{ // found a return statement and return value matches output param count
ssie(skp_push_utl, "\1skip2end");
}
}
break;
}
break;
case 'w':
if (lnsw(ln, i, "while ("))
{ // setting loop end, if not already in a loop
ssie(lp_utl, ln[ln.length() - 1] == '{' ? ind + "}" : "\1"); // check if this is a single ln while loop
break;
}
break;
}
//pdbg(ln);
//pdbg(ind + "<-");
//if (auto match = ctre::match<var_ass_regex>(ln))
if (std::regex_search(ln, match1, var_ass_regex))
{
int i = 0;
//auto test = match.get<1>();
//pdbg("variable assignment happens for " + match.get<1>().to_string());
}
if (!enc_usg)
{ // usage string can only occur once anyway
//if (auto match = ctre::match<usage_regex>(ln))
if (std::regex_search(ln, match1, usage_regex))
{ // found usg string
fi.usg = match1[1];
size_t found = -1; // so the first find uses 0 through the increment
while ((found = fi.usg.find("\\\"", found + 1)) != std::string::npos) fi.usg.replace(found, 2, "\"");
enc_usg = true;
skp_push_utl = !skp_push_utl.empty() ? skp_push_utl : "\1"; // sometimes lua_pushfstring is used before lua_error; do not consider this an output!
}
}
if (fi.in_cnt != -1)
{
process_varmap_regex(fi.in, ln, &lua_is_regex);
process_varmap_regex(fi.in, ln, &lua_to_regex);
}
if (fi.out_cnt != -1 && skp_push_utl.empty() && process_varmap_regex(fi.out, ln, &lua_push_regex))
{
if (!lp_utl.empty())
{ // found lua_push* inside a loop
fi.out_cnt = -1;
}
else
{ // normal processing
std::string lvl = is_if_ln ? ind : ind.substr(0, ind.length() - 2); // do not subtract 1 block level, if this was a simple if ln
while (lvl >= " ")
{ // if a push was found, track it for this and all lower block levels
push_track[lvl] = true;
lvl = lvl.substr(0, lvl.length() - 2);
}
}
}
if (!lp_utl.empty() && ln.starts_with(lp_utl)) lp_utl = "";
if (lp_utl == "\1") lp_utl = "";
if (!skp_push_utl.empty() && ln.starts_with(skp_push_utl)) skp_push_utl = ""; // reset skip since ln has been reached now
if (skp_push_utl == "\1") skp_push_utl = ""; // reset temporary skip which was used for 1 ln
}
}
tend = std::chrono::high_resolution_clock::now();
if (dbg) pfmap(ivld_only);
std::cout << "GhidraParser is done..." << std::endl;
pstats();
std::cin.ignore();
return 0;
}