// NOTE(review): the standard-header list was rebuilt from the std names this file uses;
// confirm against the project's original include block.
#include <algorithm>
#include <chrono>
#include <cstring>
#include <format>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "GhidraParser.hpp"

// ---------------------------------------------------------------------------
// File-level state shared by the helpers and main().
// ---------------------------------------------------------------------------
std::string file_path;      // path of the decompiled source file that main() reads
bool dbg;                   // enables pdbg() output and the per-function dump at the end of main()
bool dbg_prnt_invld_only;   // when dumping, print only functions whose parse was flagged invalid
bool use_all = false; // 46 invalid parses

FunctionInfo fi;            // the function currently being parsed (reset on every "// ADDRESS - " line)
std::string func;
std::smatch match;          // scratch match object reused by every regex_search in this file
std::string lua_type;       // scratch: type suffix captured from the last lua_* call
int lua_index;              // scratch: stack index associated with the last lua_* call
int out_index;              // running index handed to successive lua_push* calls of one function

std::regex lua_push_regex(R"(lua_push(.+?)\()");
std::regex usage_validity_regex(R"(Usage: .+?\((.*?)\))");

std::unordered_map<std::string, FunctionInfo> fmap; // finished functions, keyed by FunctionInfo::func

/// Renders a parameter map as "(t1/t2, t3, ...)".
///
/// Entries are emitted in key order and separated by ", "; the alternative types
/// recorded for one entry are separated by "/". An empty map yields "()".
std::string get_varmap_info(const std::map<int, std::vector<std::string>>& varmap) {
    std::string out = "(";
    bool first_entry = true;
    for (const auto& [index, types] : varmap) {
        if (!first_entry) out += ", ";
        first_entry = false;

        bool first_type = true;
        for (const auto& type : types) {
            if (!first_type) out += '/';
            first_type = false;
            out += type;
        }
    }
    out += ')';
    return out;
}

/// Parses a decimal or "0x"-prefixed hexadecimal integer.
///
/// Trailing non-numeric characters are ignored (std::stoi semantics).
/// @return the parsed value, or -1 when the text does not start with a number or the
///         value does not fit into an int. Callers use -1 as the "dynamic" marker.
int parse_int(const std::string& input) {
    try {
        if (input.starts_with("0x")) return std::stoi(input, nullptr, 16);
        return std::stoi(input);
    } catch (const std::invalid_argument&) {
        // not a number - fall through to the sentinel
    } catch (const std::out_of_range&) {
        // number too large for int - treat like any other unparsable value instead of terminating
    }
    return -1; // couldn't parse; -1 is used for dynamic return
}

/// Prints a debug message followed by a newline when the global `dbg` flag is set.
void pdbg(const std::string& msg) {
    if (!dbg) return;
    std::cout << msg << '\n';
}

/// Prints one FunctionInfo: address (upper-case hex), name, usage string, the input and
/// output parameter summaries ("dynamic" when the count is -1) and all parse errors,
/// followed by a blank line.
void pfi(FunctionInfo& fi) {
    const char nl = '\n';
    std::cout << std::format("{:X}", fi.address) << " " << fi.func << (fi.valid_parse ? "" : " (invalid)") << nl;
    if (!fi.usg.empty()) std::cout << fi.usg << nl;

    if (fi.param_in_cnt == -1) std::cout << "in: dynamic" << nl;
    else std::cout << "in: " << (fi.param_in_cnt == 0 ? "0" : std::to_string(fi.param_in_cnt) + " " + get_varmap_info(fi.param_in)) << nl;

    if (fi.param_out_cnt == -1) std::cout << "out: dynamic" << nl;
    else std::cout << "out: " << (fi.param_out_cnt == 0 ? "0" : std::to_string(fi.param_out_cnt) + " " + get_varmap_info(fi.param_out)) << nl;

    for (auto& error : fi.parse_errors) std::cout << error << nl;
    std::cout << std::endl;
}

/// Scans `line` for every occurrence of `*regex` and records the captured lua type in `*varmap`.
///
/// For lua_push_regex (compared by address) the index is the running global `out_index`;
/// for any other regex the index is parsed from capture group 2. When that index cannot be
/// parsed, the matching count on the global `fi` is set to -1 (dynamic) and scanning stops.
///
/// NOTE(review): lua_is_regex / lua_to_regex as defined in main() contain a single capture
/// group, so match[2] is empty for them and parse_int() yields -1 on the first hit. Confirm
/// whether the two-group patterns that are commented out in main() were meant to be active.
///
/// @return true when at least one match was found in the line.
bool process_varmap_regex(std::map<int, std::vector<std::string>> *varmap, std::string line, std::regex* regex) {
    bool found = false;
    while (std::regex_search(line, match, *regex)) {
        found = true;
        lua_type = match[1];
        if (lua_type == "lstring") lua_type = "string"; // lstring is a string!
        if (lua_type == "fstring") lua_type = "string"; // fstring is a string!

        if (regex == &lua_push_regex) {
            // pushes carry no index argument; number them in call order (main() resets out_index to 0)
            lua_index = out_index;
            out_index++;
        } else {
            lua_index = parse_int(match[2]);
        }

        if (lua_index == -1) {
            // no literal index was used, so the parameter list cannot be determined statically
            if (regex == &lua_push_regex) fi.param_out_cnt = -1;
            else fi.param_in_cnt = -1;
            return found;
        }

        // Every occurrence is recorded for now; duplicates are pruned later by sort_prune_varmap().
        (*varmap)[lua_index].push_back(lua_type);

        line = match.suffix(); // continue behind the current match
    }
    return found;
}

/// Validates the collected information and stores the verdict in fi->valid_parse.
///
/// A parse is invalid when errors were already recorded, when the return value disagrees
/// with the number of recorded lua_push* slots, when the input count disagrees with the
/// parameter list of the usage string, or when the input indexes are not exactly 1..n.
/// A usage string that does not match usage_validity_regex is reported as an error but does
/// not influence the verdict.
void chk_vld(FunctionInfo* fi) {
    bool valid = fi->parse_errors.empty(); // any parsing errors occurred

    if (fi->param_out_cnt != -1 && static_cast<size_t>(fi->param_out_cnt) != fi->param_out.size()) {
        fi->parse_errors.push_back("output param mismatch between found number of return values (" + std::to_string(fi->param_out_cnt) +") and lua_push* calls (" + std::to_string(fi->param_out.size()) + ")");
        valid = false;
    }

    if (fi->param_in_cnt != -1 && !fi->usg.empty()) { // do extra check against usg string
        if (std::regex_search(fi->usg, match, usage_validity_regex)) {
            const std::string usage_params = match[1];
            // An empty parameter list means zero parameters; otherwise there is one more
            // parameter than there are commas.
            const int usage_cnt = usage_params.empty()
                ? 0
                : static_cast<int>(std::count(usage_params.begin(), usage_params.end(), ',')) + 1;
            if (fi->param_in_cnt != usage_cnt) {
                valid = false;
                fi->parse_errors.push_back("input param count does not match usage string");
            }
        } else {
            // if the usg string is malformed, do not consider it for validity
            fi->parse_errors.push_back("usage string malformed");
        }
    }

    if (fi->param_in_cnt > 0) { // check if input param indexes are in order
        for (int i = 1; i <= fi->param_in_cnt; i++) { // lua indexes start with 1
            if (!fi->param_in.contains(i)) {
                fi->parse_errors.push_back("input param index not in order");
                valid = false;
                break;
            }
        }
    }

    fi->valid_parse = valid;
}

/// Sorts the type list of every entry and removes duplicate types.
void sort_prune_varmap(std::map<int, std::vector<std::string>> *varmap) {
    for (auto& [lua_idx, lua_types] : *varmap) {
        // std::unique only collapses adjacent equal elements, so the list has to be sorted first
        std::sort(lua_types.begin(), lua_types.end());
        lua_types.erase(std::unique(lua_types.begin(), lua_types.end()), lua_types.end());
    }
}

/// Returns true when `ln` contains `lk_for` starting exactly at position `idx`.
/// The keyword may extend to the very end of the line (e.g. a bare "default:").
inline bool ln_is(std::string &ln, int idx, const char* lk_for) {
    if (idx < 0 || static_cast<size_t>(idx) > ln.length()) return false;
    const size_t len = std::strlen(lk_for);
    const size_t pos = static_cast<size_t>(idx);
    return ln.length() - pos >= len && ln.compare(pos, len, lk_for) == 0;
}

/// Placeholder; currently does nothing.
void smp_ln_proc(std::string &ln, int i) {
}

/// Reads the decompiled source file line by line, collects a FunctionInfo for every
/// "uint lua_wow_*" function into `fmap`, then prints the (optionally filtered) results
/// together with summary counters and the elapsed time.
///
/// @return 1 when the input file cannot be opened, 0 otherwise.
int main() {
    /*
     * PARSING RULES
     * lua_is* calls can occur in if-lines
     * lua_to* calls can occur in if-lines
     * lua_push* calls can NOT occur in if-lines
     * lua_push* calls are not called with an index as param, unlike lua_is* / lua_to* - the index is inferred by the order of calls
     * lua_push* calls can occur in branch structures (if/else, switch) - they must only be considered in one of them
     * wow functions return the number of outputs - 0 when they error or there are none; consider any non-0 return as the real number of outputs
     * wow functions may return a dynamic number of outputs - there is no way to parse this into a single result
     * skip scanning for the usg string if it already has been encountered in the function, there can only be 1
     * skip scanning for input params, if unparsable index (not into int) has been found for lua_is* or lua_to* (dynamic inputs)
     * skip scanning for output params, if unparsable index (not into int) has been found for lua_push* (dynamic outputs)
     * skip scanning for output params, if returns with different non-0 values have been found (or it can not be parsed as int)
     */
    if (use_all) {
        dbg = true;
        dbg_prnt_invld_only = true;
        file_path = R"(C:\Users\alphaomega\Documents\Wow.exe.c.txt)";
    } else {
        dbg = true;
        dbg_prnt_invld_only = false;
        file_path = R"(C:\Users\alphaomega\Documents\Wow.exe.c.test.txt)";
    }

    auto tstart = std::chrono::high_resolution_clock::now();

    std::ifstream source_file(file_path);
    if (!source_file.is_open()) {
        std::cerr << "Unable to open file " << file_path << std::endl;
        return 1;
    }

    //std::regex return_regex(R"(^\s+?return\s+([^;]+);$)");
    //std::regex usage_regex(R"(^.+"(Usage: [^;]+)N{0}"\){0,1};$)"); // N{0} is just a workaround, since )" in the raw string would terminate it immediately
    std::regex usage_regex(R"(,"(Usage: [^;]+)N{0}"[, \)])"); // N{0} is just a workaround, since )" in the raw string would terminate it immediately
    //std::regex lua_is_regex(R"(lua_is(.+?)\(L,(.+?)\))");
    //std::regex lua_to_regex(R"(lua_to(.+?)\(L,(.+?)[,\)])");
    std::regex lua_is_regex(R"(lua_is(.+?)\()");
    std::regex lua_to_regex(R"(lua_to(.+?)\()");

    //bool found_return_after_push;
    std::string skp_push_utl = ""; // skip push parsing until this line has been reached
    std::string lp_utl = "";
    std::map<int, bool> push_track; // block level -> "a lua_push* was already seen at this level"
    bool in_func = false;
    bool enc_usg = false; // encountered usg
    int ret_val;          // return value; -1 when it is not an integer literal
    bool is_if_ln;
    std::string ind; // indent
    std::string ln;  // line

    // Arms the push-skip marker unless an earlier marker is still pending.
    auto arm_skip = [&skp_push_utl](const std::string& marker) {
        if (skp_push_utl.empty()) skp_push_utl = marker;
    };

    while (std::getline(source_file, ln)) {
        // lines which can instantly be discarded
        if (ln.empty()) continue;            // skip empty lines
        if (ln.starts_with("/*")) continue;  // skip comment lines

        if (!in_func) { // searching for next function
            if (ln.starts_with("// ADDRESS - ")) { // found address ln - create new FunctionInfo
                fi = {};
                fi.address = parse_int(ln.substr(13));
                out_index = 0;
                enc_usg = false;
                skp_push_utl = "";
                lp_utl = "";
                push_track = {};
                continue;
            }
            if (ln.starts_with("uint lua_wow_")) { // found ln with function signature
                fi.func = ln.substr(13, ln.find_first_of("(") - 13); // get part of real function name
                in_func = true; // start function parsing from now on
                continue;
            }
        } else { // processing current function
            if (ln == "}") { // end of function found
                in_func = false;
                if (fi.param_in_cnt == -1) fi.param_in.clear();
                else fi.param_in_cnt = static_cast<int>(fi.param_in.size());
                if (fi.param_out_cnt == -1) fi.param_out.clear();
                sort_prune_varmap(&fi.param_in);
                chk_vld(&fi);
                fmap[fi.func] = fi;
                //pdbg("# END OF " + fi.func);
                continue;
            }

            is_if_ln = false;

            // Locate the first non-space character; `ind` keeps its previous value when the
            // line consists of spaces only.
            int i = 0;
            while (i < static_cast<int>(ln.length())) {
                if (ln[i] != ' ') {
                    ind = ln.substr(0, i);
                    break;
                }
                i++;
            }
            const int blk_lvl = static_cast<int>(ind.length() / 2); // block level derived from the indent width

            switch (ln[i]) {
            case 'c':
            case 'd':
                if (ln_is(ln, i, "case ") || ln_is(ln, i, "default:")) {
                    // a previous case already contained a push: ignore pushes until the closing brace at this indent
                    if (push_track[blk_lvl]) arm_skip(ind + "}");
                    else push_track[blk_lvl] = false; // only execute if a previous case didn't already contain a push
                }
                break;
            case 'i': // check for if
                if (ln_is(ln, i, "if ")) {
                    is_if_ln = true;
                    push_track[blk_lvl] = false;
                }
                break;
            case 'e':
                if (ln_is(ln, i, "else ")) {
                    // the if-branch already pushed: skip the whole else block, or just this line
                    // ("\1" is the one-line marker) when the else has no opening brace
                    if (push_track[blk_lvl]) arm_skip(ln[ln.length() - 1] == '{' ? ind + "}" : "\1");
                }
                break;
            case 'g':
                if (!fi.param_out.empty() && ln_is(ln, i, "goto ")) {
                    // skip pushes until the jump target "<label>:" is reached
                    arm_skip(ln.substr(i + 5, ln.length() - i - 6) + ":");
                }
                break;
            case 'r': // check for return
                if (fi.param_out_cnt == -1) break; // this function has a dynamic number of outputs, no need for further processing
                if (ln_is(ln, i, "return ")) {
                    ret_val = parse_int(ln.substr(i + 7, ln.length() - i - 8));
                    // check if already encountered return value (except 0) matches; dynamic if not
                    if (ret_val != 0) { // TODO probably need to check if i am currently skipping push
                        fi.param_out_cnt = (fi.param_out_cnt == 0 || fi.param_out_cnt == ret_val) ? ret_val : -1;
                        if (ret_val >= 0 && static_cast<size_t>(ret_val) == fi.param_out.size()) {
                            // found a return statement and return value matches output param count;
                            // this marker is only cleared when the next "// ADDRESS - " line resets the state
                            arm_skip("\1skip2end");
                        }
                    }
                }
                break;
            }

            //pdbg(ln);
            //pdbg(ind + "<-");

            if (!enc_usg) { // usage string can only occur once anyway
                if (std::regex_search(ln, match, usage_regex)) { // found usg string
                    fi.usg = match[1];
                    size_t found = -1; // so the first find uses 0 through the increment
                    while ((found = fi.usg.find("\\\"", found + 1)) != std::string::npos) fi.usg.replace(found, 2, "\"");
                    enc_usg = true;
                    arm_skip("\1"); // sometimes lua_pushfstring is used before lua_error; do not consider this an output!
                }
            }

            if (fi.param_in_cnt != -1) {
                process_varmap_regex(&fi.param_in, ln, &lua_is_regex);
                process_varmap_regex(&fi.param_in, ln, &lua_to_regex);
            }

            if (fi.param_out_cnt != -1 && skp_push_utl.empty()) {
                if (process_varmap_regex(&fi.param_out, ln, &lua_push_regex)) {
                    // do not subtract 1 block level, if this was a simple if line
                    int lvl = is_if_ln ? blk_lvl : (static_cast<int>(ind.length()) - 2) / 2;
                    while (lvl >= 1) { // if a push was found, track it for this and all lower block levels
                        push_track[lvl] = true;
                        lvl--;
                    }
                }
            }

            if (skp_push_utl == ln) skp_push_utl = "";   // reset skip since line has been reached now
            if (skp_push_utl == "\1") skp_push_utl = ""; // reset temporary skip which was used for 1 line
        }
    }

    auto tend = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(tend - tstart);

    int cnt_invalid = 0;
    int cnt_total = 0;
    for (auto& [name, info] : fmap) {
        if (dbg && (!dbg_prnt_invld_only || !info.valid_parse)) pfi(info);
        cnt_invalid += info.valid_parse ? 0 : 1;
        cnt_total++;
    }

    std::cout << "GhidraParser is done..." << std::endl;
    std::cout << "Functions parsed: " << std::to_string(cnt_total) << std::endl;
    std::cout << "Invalid parses: " << std::to_string(cnt_invalid) << std::endl;
    std::cout << "duration: " << duration.count() << " ms" << std::endl;
    std::cin.ignore(); // keep the console window open until the user presses enter
    return 0;
}