LuaFunctionRegisterSpreadsheet/GhidraParser/GhidraParser.cpp

403 lines
13 KiB
C++

#include <algorithm>
#include <chrono>
#include <cstring>
#include <format>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <regex>
#include <stdexcept>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

#include "GhidraParser.hpp"
// ---- File-level state shared by the helpers below and main() ----
// Path of the input listing; assigned in main() before the file is opened.
std::string file_path;
// Gates pdbg() output and the per-function dump at the end of main().
bool dbg;
// When true, the final dump in main() prints only functions whose parse was flagged invalid.
bool dbg_prnt_invld_only;
// Selects which of the two hard-coded input files main() reads.
bool use_all = false; // 46 invalid parses
// FunctionInfo of the function currently being parsed; main() resets it on every "// ADDRESS - " line.
FunctionInfo fi;
// Not referenced anywhere in the visible code (fi.func is a separate member).
std::string func;
// Scratch match object reused by the regex searches in this file.
std::smatch match;
// Scratch values written by process_varmap_regex() for the hit it is currently handling.
std::string lua_type;
int lua_index;
// Running index handed out to successive lua_push* hits; main() resets it to 0 per function.
int out_index;
// Captures the text between "lua_push" and the opening parenthesis.
std::regex lua_push_regex(R"(lua_push(.+?)\()");
// Captures the text inside the first pair of parentheses of a "Usage: ..." string.
std::regex usage_validity_regex(R"(Usage: .+?\((.*?)\))");
// Result table: parsed functions keyed by FunctionInfo::func.
std::unordered_map<std::string, FunctionInfo> fmap;
// Renders a parameter map as a human-readable list.
//
// Entries are emitted in ascending key order, separated by ", "; the type
// names recorded for one entry are joined with "/". Example:
//   {1: [number, string], 2: [boolean]}  ->  "(number/string, boolean)"
// An empty map yields "()". An entry without any type names contributes an
// empty slot instead of corrupting the surrounding separators/parenthesis.
//
// @param varmap  Lua index -> type names seen for that index.
// @return        The formatted list, always wrapped in parentheses.
std::string get_varmap_info(std::map<int, std::vector<std::string>> varmap)
{
    std::string str = "(";
    bool first_entry = true;
    for (const auto &[key, values] : varmap)
    {
        // Separators are written *before* each item, so nothing has to be
        // trimmed afterwards (trimming breaks when an entry has no values).
        if (!first_entry) str += ", ";
        first_entry = false;

        bool first_value = true;
        for (const auto &value : values)
        {
            if (!first_value) str += '/';
            first_value = false;
            str += value;
        }
    }
    str += ')';
    return str;
}
// Parses an integer literal as std::stoi does (leading whitespace is skipped,
// trailing non-numeric characters are ignored). Input starting with "0x" is
// read as hexadecimal, everything else as decimal.
//
// @param input  Text to parse.
// @return       The parsed value, or -1 when the text holds no number or the
//               number does not fit into an int. Callers use -1 as the
//               "dynamic / not a literal" marker.
int parse_int(std::string input)
{
    try
    {
        // compare() on the first two characters never throws, even for short input.
        if (input.compare(0, 2, "0x") == 0) return std::stoi(input, nullptr, 16);
        return std::stoi(input);
    }
    catch (const std::invalid_argument&) {} // no digits at all
    catch (const std::out_of_range&) {}     // e.g. 0xffffffff does not fit into int
    return -1; // couldn't parse; -1 is used for dynamic return
}
// Debug print: writes msg followed by a newline to stdout, but only while the
// global dbg flag is set.
void pdbg(std::string msg)
{
    if (dbg)
    {
        std::cout << msg << '\n';
    }
}
// Prints one FunctionInfo to stdout:
//   <ADDRESS in upper-case hex> <name>[ (invalid)]
//   <usage string, if any>
//   in:  "dynamic" | "0" | "<count> <type list>"
//   out: "dynamic" | "0" | "<count> <type list>"
//   <one line per recorded parse error>
// followed by a blank line (which also flushes the stream).
void pfi(FunctionInfo& fi)
{
    // Header line.
    std::cout << std::format("{:X}", fi.address) << " " << fi.func;
    if (!fi.valid_parse) std::cout << " (invalid)";
    std::cout << '\n';

    if (!fi.usg.empty()) std::cout << fi.usg << '\n';

    // Input parameters; a count of -1 marks a dynamic parameter list.
    if (fi.param_in_cnt == -1)
    {
        std::cout << "in: dynamic" << '\n';
    }
    else if (fi.param_in_cnt == 0)
    {
        std::cout << "in: " << "0" << '\n';
    }
    else
    {
        std::cout << "in: " << std::to_string(fi.param_in_cnt) << " " << get_varmap_info(fi.param_in) << '\n';
    }

    // Output parameters, same layout as the inputs.
    if (fi.param_out_cnt == -1)
    {
        std::cout << "out: dynamic" << '\n';
    }
    else if (fi.param_out_cnt == 0)
    {
        std::cout << "out: " << "0" << '\n';
    }
    else
    {
        std::cout << "out: " << std::to_string(fi.param_out_cnt) << " " << get_varmap_info(fi.param_out) << '\n';
    }

    for (const auto &error : fi.parse_errors)
    {
        std::cout << error << '\n';
    }
    std::cout << std::endl;
}
// Scans `line` for every occurrence of `regex` and records the Lua type of
// each hit in `varmap` under its Lua index.
//
// The type name is capture group 1 ("lstring" and "fstring" are folded into
// "string"). The index depends on the pattern:
//   - lua_push_regex: the running global out_index (post-incremented per hit);
//   - any other pattern: capture group 2, converted with parse_int().
// If the index comes out as -1 (no integer literal), the matching count in
// the global `fi` (param_out_cnt for pushes, param_in_cnt otherwise) is set
// to -1 and scanning of this line stops.
//
// Every hit is appended as-is; duplicates are not filtered here.
//
// Side effects: writes the globals match, lua_type, lua_index and, for push
// patterns, out_index.
//
// @param varmap  Destination map (Lua index -> type names).
// @param line    Text to scan; taken by value because it is consumed hit by hit.
// @param regex   Pattern to search for; compared by address against lua_push_regex.
// @return        true if the pattern matched at least once.
bool process_varmap_regex(std::map<int, std::vector<std::string>> *varmap, std::string line, std::regex* regex)
{
    const bool is_push = (regex == &lua_push_regex);
    bool found = false;

    // After each recorded hit, continue with the text following the match.
    for (; std::regex_search(line, match, *regex); line = match.suffix())
    {
        found = true;

        lua_type = match[1];
        if (lua_type == "lstring" || lua_type == "fstring")
        {
            lua_type = "string"; // both variants are plain strings on the Lua side
        }

        if (is_push)
        {
            // pushes carry no index argument; their order defines the index
            lua_index = out_index;
            out_index++;
        }
        else
        {
            lua_index = parse_int(match[2]);
        }

        if (lua_index == -1)
        {
            // no literal index available -> parameter list is treated as dynamic
            if (is_push) fi.param_out_cnt = -1;
            else fi.param_in_cnt = -1;
            return found;
        }

        (*varmap)[lua_index].push_back(lua_type);
    }
    return found;
}
// Validates the information collected for one function and stores the verdict
// in fi->valid_parse. Every failed check appends a message to fi->parse_errors.
//
// A parse is invalid when
//   - parse errors were already recorded before this call,
//   - a fixed output count (param_out_cnt != -1) differs from the number of
//     output indexes collected in param_out,
//   - a fixed input count (param_in_cnt != -1) differs from the number of
//     parameters listed in the usage string (0 for an empty "()" list,
//     otherwise number of commas + 1),
//   - the input indexes 1..param_in_cnt are not all present in param_in.
// A usage string that does not match usage_validity_regex is reported as
// "usage string malformed" but does not by itself invalidate the parse.
//
// Side effect: overwrites the global `match`.
//
// @param fi  Function record to check; must not be null.
void chk_vld(FunctionInfo* fi)
{
    bool valid = fi->parse_errors.empty(); // any parsing errors occurred earlier?

    if (fi->param_out_cnt != -1 && fi->param_out_cnt != fi->param_out.size())
    {
        fi->parse_errors.push_back("output param mismatch between found number of return values (" + std::to_string(fi->param_out_cnt) +") and lua_push* calls (" + std::to_string(fi->param_out.size()) + ")");
        valid = false;
    }

    if (fi->param_in_cnt != -1 && !fi->usg.empty())
    { // do extra check against usg string
        if (std::regex_search(fi->usg, match, usage_validity_regex))
        {
            const std::string usage_params = match[1];
            // An empty list means zero parameters; counting "commas + 1"
            // unconditionally would report one parameter for "Foo()".
            int expected_in_cnt = 0;
            if (!usage_params.empty())
            {
                expected_in_cnt = 1 + static_cast<int>(std::count(usage_params.begin(), usage_params.end(), ','));
            }
            if (fi->param_in_cnt != expected_in_cnt)
            {
                valid = false;
                fi->parse_errors.push_back("input param count does not match usage string");
            }
        }
        else fi->parse_errors.push_back("usage string malformed"); // if the usg string is malformed, do not consider it for validity
    }

    if (fi->param_in_cnt > 0)
    { // check if input param indexes are in order
        for (int i = 1; i <= fi->param_in_cnt; i++)
        { // lua indexes start with 1
            if (!fi->param_in.contains(i))
            {
                fi->parse_errors.push_back("input param index not in order");
                valid = false;
                break;
            }
        }
    }

    fi->valid_parse = valid;
}
// Sorts the type names of every entry and removes duplicate names.
//
// Sorting has to happen first: std::unique only collapses *adjacent* equal
// elements, so running it on unsorted data leaves duplicates behind.
//
// @param varmap  Map to normalize in place; a null pointer is ignored.
void sort_prune_varmap(std::map<int, std::vector<std::string>> *varmap)
{
    if (varmap == nullptr) return;
    for (auto &[index, types] : *varmap)
    {
        std::sort(types.begin(), types.end());
        types.erase(std::unique(types.begin(), types.end()), types.end());
    }
}
// Tells whether the text of `ln` starting at position `idx` begins with `lk_for`.
//
// The keyword may be the very last thing on the line (e.g. a line that ends
// in "default:"). An `idx` outside the string yields false instead of
// throwing.
//
// @param ln      Line to inspect (not modified).
// @param idx     Position in `ln` at which the keyword is expected.
// @param lk_for  Null-terminated keyword to look for.
// @return        true if `ln` contains `lk_for` exactly at `idx`.
inline bool ln_is(std::string &ln, int idx, const char* lk_for)
{
    if (idx < 0 || static_cast<std::size_t>(idx) > ln.length()) return false;
    const std::size_t len = std::strlen(lk_for);
    // compare() clamps to the end of the string, so a too-short remainder
    // simply compares unequal; no temporary substring is created.
    return ln.compare(static_cast<std::size_t>(idx), len, lk_for) == 0;
}
// Empty placeholder: accepts a line and an index but does nothing, and is not
// called anywhere in the visible code.
// NOTE(review): the name presumably abbreviates "simple line processing" -
// confirm whether this stub is still planned or can be removed.
void smp_ln_proc(std::string &ln, int i)
{
}
// Entry point. Reads the text file at file_path line by line, builds one
// FunctionInfo per "uint lua_wow_*" function (Lua inputs from lua_is*/lua_to*
// hits, Lua outputs from lua_push* hits and "return N;" lines), stores each
// record in fmap and finally prints the records plus summary counters.
// Returns 1 when the input file cannot be opened, 0 otherwise.
int main()
{
/*
* PARSING RULES
* lua_is* calls can occur in if-lines
* lua_to* calls can occur in if-lines
* lua_push* calls can NOT occur in if-lines
* lua_push* calls are not called with an index as param, unlike lua_is* / lua_to* - the index is inferred by the order of calls
* lua_push* calls can occur in branch structures (if/else, switch) - they must only be considered in one of them
* wow functions return the number of outputs - 0 when they error or there are none; consider any non-0 return as the real number of outputs
* wow functions may return a dynamic number of outputs - there is no way to parse this into a single result
* skip scanning for the usg string if it already has been encountered in the function, there can only be 1
* skip scanning for input params, if unparsable index (not into int) has been found for lua_is* or lua_to* (dynamic inputs)
* skip scanning for output params, if unparsable index (not into int) has been found for lua_push* (dynamic outputs)
* skip scanning for output params, if returns with different non-0 values have been found (or it can not be parsed as int)
*/
// use_all selects the full listing and limits the final dump to invalid
// parses; otherwise the test listing is read and every function is dumped.
if (use_all)
{
dbg = true;
dbg_prnt_invld_only = true;
file_path = R"(C:\Users\alphaomega\Documents\Wow.exe.c.txt)";
}
else
{
dbg = true;
dbg_prnt_invld_only = false;
file_path = R"(C:\Users\alphaomega\Documents\Wow.exe.c.test.txt)";
}
// Start of the timed section (file open + parsing).
auto tstart = std::chrono::high_resolution_clock::now();
std::ifstream source_file(file_path);
if (!source_file.is_open())
{
std::cerr << "Unable to open file " << file_path << std::endl;
return 1;
}
//std::regex return_regex(R"(^\s+?return\s+([^;]+);$)");
//std::regex usage_regex(R"(^.+"(Usage: [^;]+)N{0}"\){0,1};$)"); // N{0} is just a workaround, since )" in the raw string would terminate it immediately
std::regex usage_regex(R"(,"(Usage: [^;]+)N{0}"[, \)])"); // N{0} is just a workaround, since )" in the raw string would terminate it immediately
//std::regex lua_is_regex(R"(lua_is(.+?)\(L,(.+?)\))");
//std::regex lua_to_regex(R"(lua_to(.+?)\(L,(.+?)[,\)])");
// NOTE(review): the two active patterns below have a single capture group,
// while process_varmap_regex() reads capture group 2 as the Lua index for
// non-push patterns (the commented-out variants above do capture it).
// With the active patterns that group is empty, which appears to make every
// lua_is*/lua_to* hit mark the inputs as dynamic - confirm this is intended.
std::regex lua_is_regex(R"(lua_is(.+?)\()");
std::regex lua_to_regex(R"(lua_to(.+?)\()");
//bool found_return_after_push;
// Push-skip marker. Empty = pushes are parsed. Otherwise pushes are ignored
// until a line equal to this string has been read. Two special values:
//   "\1"         - skip only the current line (cleared at the end of the iteration)
//   "\1skip2end" - not expected to equal any input line, so the skip lasts
//                  until the next "// ADDRESS - " line resets the state
std::string skp_push_utl = ""; // skip push parsing until this line has been reached
// Only ever assigned "" in the visible code; never read.
std::string lp_utl = "";
// Block level (indent length / 2) -> whether a lua_push* was recorded at that level.
std::map<int, bool> push_track;
// true while the lines of a "uint lua_wow_*" function body are being read.
bool in_func = false;
bool enc_usg = false; // encountered usg
// NOTE(review): `uint` is not a standard C++ type; presumably supplied by
// GhidraParser.hpp or the platform headers - confirm.
uint ret_val; // return value
// true when the current line starts (after its indent) with "if ".
bool is_if_ln;
std::string ind; // indent
std::string ln; // line
while (std::getline(source_file, ln))
{
{ // check for lines which can instantly be discarded
if (ln.empty()) continue; // skip empty lines
if (ln.starts_with("/*")) continue; // skip comment lines
}
if (!in_func)
{ // searching for next function
if (ln.starts_with("// ADDRESS - "))
{ // found address ln - create new FunctionInfo and reset all per-function state
fi = {};
// 13 == length of the "// ADDRESS - " prefix
fi.address = parse_int(ln.substr(13));
out_index = 0;
enc_usg = false;
skp_push_utl = "";
lp_utl = "";
push_track = {};
continue;
}
if (ln.starts_with("uint lua_wow_"))
{ // found ln with function signature
// 13 == length of the "uint lua_wow_" prefix; the name runs up to the first '('
fi.func = ln.substr(13, ln.find_first_of("(") - 13); // get part of real function name
in_func = true; // start function parsing from now on
continue;
}
}
else
{ // processing current function
if (ln == "}")
{ // end of function found (closing brace without indent)
in_func = false;
// dynamic counts (-1) carry no usable map; otherwise the input count
// becomes the number of distinct indexes that were collected
if (fi.param_in_cnt == -1) fi.param_in.clear();
else fi.param_in_cnt = fi.param_in.size();
if (fi.param_out_cnt == -1) fi.param_out.clear();
// only the input map is normalized here; the output map is stored as collected
sort_prune_varmap(&fi.param_in);
chk_vld(&fi);
fmap[fi.func] = fi;
//pdbg("# END OF " + fi.func);
continue;
}
is_if_ln = false;
// Find the first non-space character; everything before it is the indent.
// If the line consists of spaces only, ind keeps its previous value and
// ln[i] below is the string's terminating null, so no case matches.
int i = 0;
while (i < ln.length())
{
if (ln[i] != ' ')
{
ind = ln.substr(0, i);
break;
}
i++;
}
// Dispatch on the first character to recognise the statements that
// influence push tracking and the output count.
switch (ln[i])
{
case 'c':
case 'd':
if (ln_is(ln, i, "case ") || ln_is(ln, i, "default:"))
{
// A push was already recorded at this block level: ignore pushes until
// the line that closes the block at this indent (unless a skip is
// already active). Note that push_track[...] inserts `false` when the
// level has no entry yet.
if (push_track[ind.length() / 2]) skp_push_utl = !skp_push_utl.empty() ? skp_push_utl : ind + "}";
else push_track[ind.length() / 2] = false; // only execute if a previous case didn't already contain a push
break;
}
break;
case 'i': // check for if
if (ln_is(ln, i, "if "))
{ // line starts with an if statement: remember it and reset the level's push flag
is_if_ln = true;
push_track[ind.length() / 2] = false;
break;
}
break;
case 'e':
if (ln_is(ln, i, "else "))
{ // line starts with an else branch
// If the matching level already recorded a push, skip this branch:
// up to its closing brace when the line ends in '{', otherwise just
// this one line ("\1").
if (push_track[ind.length() / 2]) skp_push_utl = !skp_push_utl.empty() ? skp_push_utl : ln[ln.length() - 1] == '{' ? ind + "}" : "\1";
break;
}
break;
case 'g':
if (!fi.param_out.empty() && ln_is(ln, i, "goto "))
{
// Once outputs exist, ignore pushes until the goto target's label line:
// text after "goto " without the line's last character, plus ':'.
skp_push_utl = !skp_push_utl.empty() ? skp_push_utl : ln.substr(i + 5, ln.length() - i - 6) + ":";
break;
}
break;
case 'r': // check for return
if (fi.param_out_cnt == -1) break; // this function has a dynamic number of outputs, no need for further processing
if (ln_is(ln, i, "return "))
{ // line starts with a return statement
// text after "return " without the line's last character
// NOTE(review): parse_int's -1 ("not a literal") is converted to the
// unsigned ret_val here; whether it ends up as -1 in param_out_cnt
// depends on that field's type in FunctionInfo - confirm.
ret_val = parse_int(ln.substr(i + 7, ln.length() - i - 8));
// check if already encountered return value (except 0) matches; dynamic if not
if (ret_val != 0)
{ // TODO probably need to check if i am currently skipping push
fi.param_out_cnt = (fi.param_out_cnt == 0 || fi.param_out_cnt == ret_val) ? ret_val : -1;
if (ret_val == fi.param_out.size())
{ // found a return statement and return value matches output param count
// ignore all further pushes of this function
skp_push_utl = !skp_push_utl.empty() ? skp_push_utl : "\1skip2end";
}
}
break;
}
break;
}
//pdbg(ln);
//pdbg(ind + "<-");
if (!enc_usg)
{ // usage string can only occur once anyway
if (std::regex_search(ln, match, usage_regex))
{ // found usg string
fi.usg = match[1];
// turn every escaped quote (backslash + '"') into a plain '"'
size_t found = -1; // so the first find uses 0 through the increment
while ((found = fi.usg.find("\\\"", found + 1)) != std::string::npos) fi.usg.replace(found, 2, "\"");
enc_usg = true;
skp_push_utl = !skp_push_utl.empty() ? skp_push_utl : "\1"; // sometimes lua_pushfstring is used before lua_error; do not consider this an output!
}
}
// Inputs: collected unless the input list was already marked dynamic.
if (fi.param_in_cnt != -1)
{
process_varmap_regex(&fi.param_in, ln, &lua_is_regex);
process_varmap_regex(&fi.param_in, ln, &lua_to_regex);
}
// Outputs: collected unless marked dynamic or a push skip is active.
if (fi.param_out_cnt != -1 && skp_push_utl.empty())
{
if (process_varmap_regex(&fi.param_out, ln, &lua_push_regex))
{
// NOTE(review): ind.length() - 2 is unsigned arithmetic and wraps for an
// indent shorter than two characters - confirm such lines cannot reach
// this point or that the resulting lvl is harmless.
int lvl = is_if_ln ? ind.length() / 2 : (ind.length() - 2) / 2; // do not subtract 1 block level, if this was a simple if line
while (lvl >= 1)
{ // if a push was found, track it for this and all lower block levels
push_track[lvl] = true;
lvl--;
}
}
}
// No-op block: declares an unused local and has no effect (looks like a
// leftover debugger breakpoint anchor - TODO confirm and remove).
if (!skp_push_utl.empty())
{
int i = 0;
}
if (skp_push_utl == ln)
skp_push_utl = ""; // reset skip since line has been reached now
if (skp_push_utl == "\1")
skp_push_utl = ""; // reset temporary skip which was used for 1 line
}
}
// End of the timed section.
auto tend = std::chrono::high_resolution_clock::now();
auto duration = duration_cast<std::chrono::milliseconds>(tend - tstart);
int cnt_invalid = 0;
int cnt_total = 0;
// Dump the collected functions (all of them, or only the invalid ones when
// dbg_prnt_invld_only is set) and count totals. The loop variable `fi`
// shadows the global of the same name.
for(auto &[name, fi] : fmap)
{
if (dbg && (!dbg_prnt_invld_only || !fi.valid_parse)) pfi(fi);
cnt_invalid += fi.valid_parse ? 0 : 1;
cnt_total++;
}
std::cout << "GhidraParser is done..." << std::endl;
std::cout << "Functions parsed: " << std::to_string(cnt_total) << std::endl;
std::cout << "Invalid parses: " << std::to_string(cnt_invalid) << std::endl;
std::cout << "duration: " << duration.count() << " ms" << std::endl;
// Blocks until input is available on stdin (presumably to keep the console
// window open after the summary).
std::cin.ignore();
return 0;
}