2024-01-22 02:31:32 +01:00
# include <algorithm>
# include <chrono>
# include <cstring>
# include <fstream>
# include <iostream>
# include <map>
# include <queue>
# include <regex>
# include <stdexcept>
# include <string>
# include <thread>
# include <unordered_map>
# include <utility>
# include "GhidraParser.hpp"
// ---- file-scope state shared by main() and the helper functions below ----
// Path of the decompiled source file to parse; assigned at the start of main().
std : : string file_path ;
// Master switch for debug output: pdbg() prints nothing while this is false, and main() only prints parsed functions when it is true.
bool dbg ;
// When true, main() prints only the functions whose valid_parse flag is false.
bool dbg_prnt_invld_only ;
2024-01-22 03:29:57 +01:00
// Selects which of the two hard-coded input files main() opens.
bool use_all = true ; // 46 invalid parses
2024-01-22 02:31:32 +01:00
// Record of the function currently being parsed; main() resets it on every address line and copies it into fmap at the end of the function.
FunctionInfo fi ;
// Not referenced anywhere in the visible code.
std : : string func ;
// Scratch match result written by std::regex_search calls in process_varmap_regex(), chk_vld() and main().
std : : smatch match ;
// Scratch values used by process_varmap_regex(): type name and stack index of the call that was just matched.
std : : string lua_type ;
int lua_index ;
// Running index handed out to lua_push* matches; main() sets it to 0 on every address line.
int out_index ;
// Matches a lua_push* call and captures the text between "lua_push" and the opening parenthesis.
std : : regex lua_push_regex ( R " (lua_push(.+?) \ () " ) ;
// Captures the parenthesised parameter list of a usage string; used by chk_vld().
std : : regex usage_validity_regex ( R " (Usage: .+? \ ((.*?) \ )) " ) ;
// All parsed functions, keyed by FunctionInfo::func.
std : : unordered_map < std : : string , FunctionInfo > fmap ;
// Renders a parameter map as "(t1/t2, t3, ...)".
//
// varmap maps a Lua stack index to the list of type names seen for that index.
// The slots are emitted in ascending index order, separated by ", "; the types
// of one slot are separated by "/". An empty map yields "()".
//
// The string is built with explicit separators instead of appending a trailing
// separator and cutting it off afterwards: the cut-off approach removed the
// wrong characters (the opening parenthesis or the previous separator) whenever
// a slot had an empty type list.
std::string get_varmap_info(std::map<int, std::vector<std::string>> varmap)
{
    std::string str = "(";
    bool first_slot = true;
    for (const auto& [key, values] : varmap)
    {
        if (!first_slot) str += ", ";
        first_slot = false;

        bool first_type = true;
        for (const auto& value : values)
        {
            if (!first_type) str += '/';
            first_type = false;
            str += value;
        }
    }
    str += ')';
    return str;
}
// Parses a decimal or "0x"-prefixed hexadecimal integer.
//
// Returns the parsed value, or -1 when the text is not a number or does not
// fit into an int. Callers treat -1 as "dynamic / unknown".
// Trailing non-numeric characters after a valid number are ignored (std::stoi
// semantics).
int parse_int(std::string input)
{
    try
    {
        const bool is_hex = input.compare(0, 2, "0x") == 0;
        return std::stoi(input, nullptr, is_hex ? 16 : 10);
    }
    catch (const std::invalid_argument&) { } // no digits at all
    catch (const std::out_of_range&) { }     // value does not fit into int; previously uncaught and fatal
    return -1; // couldn't parse; -1 is used for dynamic return
}
void pdbg ( std : : string msg )
{
if ( ! dbg ) return ;
std : : cout < < msg < < ' \n ' ;
}
// Prints one FunctionInfo to std::cout:
//   - the address formatted as upper-case hex, the function name and an "(invalid)" marker when valid_parse is false
//   - the usage string, if one was found
//   - the input and output parameter counts ("dynamic" when the count is -1), followed by the type lists rendered by get_varmap_info() when the count is non-zero
//   - every recorded parse error, one per line
// The entry is terminated by an empty line written with std::endl, which also flushes the stream.
// NOTE(review): std::format is used here but <format> is not among the includes visible in this file - presumably it arrives transitively; confirm.
void pfi ( FunctionInfo & fi )
{ // print FunctionInfo
// plain newline character used for all lines of the entry (does not flush, unlike the std::endl on the last line)
char endl = ' \n ' ;
std : : cout < < std : : format ( " {:X} " , fi . address ) < < " " < < fi . func < < ( fi . valid_parse ? " " : " (invalid) " ) < < endl ;
if ( ! fi . usg . empty ( ) ) std : : cout < < fi . usg < < endl ;
// a count of -1 marks a dynamic number of parameters; there is no type list to print in that case
if ( fi . param_in_cnt = = - 1 ) std : : cout < < " in: dynamic " < < endl ;
else std : : cout < < " in: " < < ( fi . param_in_cnt = = 0 ? " 0 " : std : : to_string ( fi . param_in_cnt ) + " " + get_varmap_info ( fi . param_in ) ) < < endl ;
if ( fi . param_out_cnt = = - 1 ) std : : cout < < " out: dynamic " < < endl ;
else std : : cout < < " out: " < < ( fi . param_out_cnt = = 0 ? " 0 " : std : : to_string ( fi . param_out_cnt ) + " " + get_varmap_info ( fi . param_out ) ) < < endl ;
for ( auto & error : fi . parse_errors ) std : : cout < < error < < endl ;
std : : cout < < std : : endl ;
}
bool process_varmap_regex ( std : : map < int , std : : vector < std : : string > > * varmap , std : : string line , std : : regex * regex )
{
bool found = false ;
while ( std : : regex_search ( line , match , * regex ) )
{
found = true ;
lua_type = match [ 1 ] ;
if ( lua_type = = " lstring " ) lua_type = " string " ; // lstring is a string!
if ( lua_type = = " fstring " ) lua_type = " string " ; // frsting is a string!
if ( regex = = & lua_push_regex )
{ // push parsing uses a global index starting with 1
//if (lua_type == "nil") return false; // skipping lua_pushnil
lua_index = out_index ;
out_index + + ;
}
else lua_index = parse_int ( match [ 2 ] ) ;
if ( lua_index = = - 1 )
{ // -1 means there wasn't a literal used for accessing the index, so i can not parse it
if ( regex = = & lua_push_regex ) fi . param_out_cnt = - 1 ;
else fi . param_in_cnt = - 1 ;
return found ;
}
( * varmap ) [ lua_index ] . push_back ( lua_type ) ; // always push lua type for now
//if (varmap->count(lua_index) == 0) (*varmap)[lua_index].push_back(lua_type); // new entry
//else if ((*varmap)[lua_index] != lua_type)
//{ // already got an entry, check if it has the same type
// fi.parse_errors.push_back("parameter type mismatch for index " + std::to_string(lua_index) + " - " + (*varmap)[lua_index] + " vs " + lua_type);
//}
line = match . suffix ( ) ;
}
return found ;
}
// Checks the parsed data of one function for consistency and stores the
// verdict in fi->valid_parse. Every failed check also appends a message to
// fi->parse_errors.
//
// A parse is invalid when
//   - parse errors were already recorded before this call,
//   - a fixed output count (param_out_cnt != -1) differs from the number of
//     recorded lua_push* slots,
//   - a fixed input count differs from the number of parameters listed in the
//     usage string (only checked when a usage string exists), or
//   - the input indexes 1..param_in_cnt are not all present in param_in.
// A usage string that does not match usage_validity_regex is reported as
// "usage string malformed" but does not invalidate the parse.
void chk_vld(FunctionInfo* fi)
{
    bool valid = fi->parse_errors.empty(); // any parsing errors occurred earlier

    if (fi->param_out_cnt != -1 && static_cast<std::size_t>(fi->param_out_cnt) != fi->param_out.size())
    {
        fi->parse_errors.push_back("output param mismatch between found number of return values (" + std::to_string(fi->param_out_cnt) + ") and lua_push* calls (" + std::to_string(fi->param_out.size()) + ")");
        valid = false;
    }

    if (fi->param_in_cnt != -1 && !fi->usg.empty())
    { // do extra check against usg string
        std::smatch usage_match; // local, so the shared global match object is left untouched
        if (std::regex_search(fi->usg, usage_match, usage_validity_regex))
        {
            const std::string usage_params = usage_match[1];

            // An empty (or blank) parameter list means zero parameters. Counting
            // "commas + 1" unconditionally reported one parameter for "Foo()",
            // which rejected correct zero-input parses and accepted wrong
            // one-input parses.
            int expected_cnt = 0;
            if (usage_params.find_first_not_of(" \t") != std::string::npos)
            {
                expected_cnt = static_cast<int>(std::count(usage_params.begin(), usage_params.end(), ',')) + 1;
            }

            if (fi->param_in_cnt != expected_cnt)
            {
                valid = false;
                fi->parse_errors.push_back("input param count does not match usage string");
            }
        }
        else fi->parse_errors.push_back("usage string malformed"); // if the usg string is malformed, do not consider it for validity
    }

    if (fi->param_in_cnt > 0)
    { // check if input param indexes are in order
        for (int i = 1; i <= fi->param_in_cnt; i++)
        { // lua indexes start with 1
            if (!fi->param_in.contains(i))
            {
                fi->parse_errors.push_back("input param index not in order");
                valid = false;
                break;
            }
        }
    }

    fi->valid_parse = valid;
}
// Sorts the type list of every slot in *varmap and removes duplicate types.
//
// The list has to be sorted before std::unique is applied: std::unique only
// collapses adjacent equal elements, so running it on the unsorted list left
// duplicates such as {"number", "string", "number"} in place.
void sort_prune_varmap(std::map<int, std::vector<std::string>>* varmap)
{
    for (auto& [slot, types] : *varmap)
    {
        std::sort(types.begin(), types.end());
        types.erase(std::unique(types.begin(), types.end()), types.end());
    }
}
// Returns true when the text lk_for occurs in ln exactly at position idx.
//
// ln      line to inspect (not modified)
// idx     start position inside ln
// lk_for  null-terminated text to look for
//
// A position outside the line (negative or past the end) yields false. The
// previous size check was done in unsigned arithmetic and wrapped around for
// idx > ln.length(), which then made substr() throw.
// The comparison is done in place, without creating a temporary substring.
inline bool ln_is(std::string& ln, int idx, const char* lk_for)
{
    if (idx < 0 || static_cast<std::size_t>(idx) > ln.length()) return false;
    // compare() clamps the compared range to the end of ln, so a remainder that
    // is shorter than lk_for can never compare equal
    return ln.compare(static_cast<std::size_t>(idx), std::strlen(lk_for), lk_for) == 0;
}
2024-01-22 03:29:57 +01:00
// Assigns repl to *init, but only when *init is currently empty; a non-empty
// *init is left untouched.
//
// The replacement is moved into place, and the non-empty case no longer
// copy-assigns the string to itself.
inline void set_str_if_empty(std::string* init, std::string repl)
{
    if (init->empty()) *init = std::move(repl);
}
// Entry point. Reads the file at file_path line by line and fills fmap with one FunctionInfo per parsed function:
//   - outside a function it looks for an address line (resets all per-function state) and for a function signature line (starts parsing)
//   - inside a function it classifies each line by its first non-space character, collects the usage string, the lua_is*/lua_to* inputs and the lua_push* outputs
//   - a line that is exactly the closing brace ends the function: the record is finalized, validated with chk_vld() and stored in fmap
// Afterwards the parsed functions are printed (subject to dbg / dbg_prnt_invld_only), followed by the counters and the elapsed time.
// Returns 1 when the input file can not be opened, 0 otherwise. Waits for input on std::cin before returning.
int main ( )
{
/*
* PARSING RULES
* lua_is * calls can occur in if - lines
* lua_to * calls can occur in if - lines
* lua_push * calls can NOT occur in if - lines
* lua_push * calls are not called with an index as param , unlike lua_is * / lua_to * - the index is inferred by the order of calls
* lua_push * calls can occur in branch structures ( if / else , switch ) - they must only be considered in one of them
* wow functions return the number of outputs - 0 when they error or there are none ; consider any non - 0 return as the real number of outputs
* wow functions may return a dynamic number of outputs - there is no way to parse this into a single result
* skip scanning for the usg string if it already has been encountered in the function , there can only be 1
* skip scanning for input params , if unparsable index ( not into int ) has been found for lua_is * or lua_to * ( dynamic inputs )
* skip scanning for output params , if returns with different non - 0 values have been found ( or it can not be parsed as int )
*/
// both branches enable dbg; they differ in dbg_prnt_invld_only and in the hard-coded input file
if ( use_all )
{
dbg = true ;
dbg_prnt_invld_only = true ;
file_path = R " (C: \ Users \a lphaomega \ Documents \ Wow.exe.c.txt) " ;
}
else
{
dbg = true ;
dbg_prnt_invld_only = false ;
file_path = R " (C: \ Users \a lphaomega \ Documents \ Wow.exe.c.test.txt) " ;
}
// start of the time measurement that is reported at the end
auto tstart = std : : chrono : : high_resolution_clock : : now ( ) ;
std : : ifstream source_file ( file_path ) ;
if ( ! source_file . is_open ( ) )
{
std : : cerr < < " Unable to open file " < < file_path < < std : : endl ;
return 1 ;
}
std : : regex usage_regex ( R " (, " ( Usage : [ ^ ; ] + ) N { 0 } " [, \ )]) " ) ; // N{0} is just a workaround, since )" in the raw string would terminate it immediately
// NOTE(review): both regexes below have a single capture group, but process_varmap_regex() reads the stack index from match[2] for them - confirm that the index is really captured
std : : regex lua_is_regex ( R " (lua_is(.+?) \ () " ) ;
std : : regex lua_to_regex ( R " (lua_to(.+?) \ () " ) ;
std : : string skp_push_utl = " " ; // skip push parsing until this line has been reached
// while non-empty, parsing is inside a loop; holds the line prefix that ends the loop
std : : string lp_utl = " " ;
// per block level (indent length / 2): has a lua_push* call been seen in that block
std : : map < int , bool > push_track ;
bool in_func = false ;
bool enc_usg = false ; // encountered usg
// NOTE(review): uint is not declared in this file - presumably provided by GhidraParser.hpp or the toolchain. parse_int() returns -1 on failure, which is stored here as an unsigned value.
uint ret_val ; // return value
bool is_if_ln ;
std : : string ind ; // indent
std : : string ln ; // line
while ( std : : getline ( source_file , ln ) )
{
{ // check for lines which can instantly be discarded
if ( ln . empty ( ) ) continue ; // skip empty lines
if ( ln . starts_with ( " /* " ) ) continue ; // skip comment lines
}
if ( ! in_func )
{ // searching for next function
if ( ln . starts_with ( " // ADDRESS - " ) )
2024-01-22 03:29:57 +01:00
{ // found address ln - create new FunctionInfo and reset variables
2024-01-22 02:31:32 +01:00
fi = { } ;
// the address is the text after the 13 character long prefix
fi . address = parse_int ( ln . substr ( 13 ) ) ;
out_index = 0 ;
enc_usg = false ;
lp_utl = " " ;
2024-01-22 03:29:57 +01:00
skp_push_utl = " " ;
2024-01-22 02:31:32 +01:00
push_track = { } ;
continue ;
}
if ( ln . starts_with ( " uint lua_wow_ " ) )
{ // found ln with function signature
fi . func = ln . substr ( 13 , ln . find_first_of ( " ( " ) - 13 ) ; // get part of real function name
in_func = true ; // start function parsing from now on
continue ;
}
}
else
{ // processing current function
if ( ln = = " } " )
{ // end of function found
in_func = false ;
// a dynamic count (-1) discards the collected slots; a fixed input count is taken from the number of collected slots
if ( fi . param_in_cnt = = - 1 ) fi . param_in . clear ( ) ;
else fi . param_in_cnt = fi . param_in . size ( ) ;
if ( fi . param_out_cnt = = - 1 ) fi . param_out . clear ( ) ;
// only the input slots are sorted and pruned here; param_out is left as collected
sort_prune_varmap ( & fi . param_in ) ;
chk_vld ( & fi ) ;
fmap [ fi . func ] = fi ;
//pdbg("# END OF " + fi.func);
continue ;
}
is_if_ln = false ;
// find the first non-space character; ind receives the leading spaces.
// If the line consists only of spaces, the loop ends with i == ln.length() and ind keeps its previous value.
int i = 0 ;
while ( i < ln . length ( ) )
{
if ( ln [ i ] ! = ' ' )
{
ind = ln . substr ( 0 , i ) ;
break ;
}
i + + ;
}
// classify the line by its first non-space character; the block level used for push_track is ind.length() / 2 (assumes two-space indentation in the input - TODO confirm)
switch ( ln [ i ] )
{
case ' c ' :
case ' d ' :
2024-01-22 03:29:57 +01:00
if ( ln_is ( ln , i , " do { " ) )
{ // setting loop end, if not already in a loop
set_str_if_empty ( & lp_utl , ind + " } " ) ;
}
2024-01-22 02:31:32 +01:00
if ( ln_is ( ln , i , " case " ) | | ln_is ( ln , i , " default: " ) )
{
2024-01-22 03:29:57 +01:00
if ( push_track [ ind . length ( ) / 2 ] ) set_str_if_empty ( & skp_push_utl , ind + " } " ) ;
2024-01-22 02:31:32 +01:00
else push_track [ ind . length ( ) / 2 ] = false ; // only execute if a previous case didn't already contain a push
break ;
}
break ;
case ' i ' : // check for if
if ( ln_is ( ln , i , " if " ) )
{ // enough characters left to be if statement
is_if_ln = true ;
// a new if starts with "no push seen yet" on its block level
push_track [ ind . length ( ) / 2 ] = false ;
break ;
}
break ;
case ' e ' :
if ( ln_is ( ln , i , " else " ) )
{ // enough characters left to be else statement
// if the preceding branch already pushed: skip pushes until the closing brace of this else block, or for this single line when the else line does not end with an opening brace
2024-01-22 03:29:57 +01:00
if ( push_track [ ind . length ( ) / 2 ] ) set_str_if_empty ( & skp_push_utl , ln [ ln . length ( ) - 1 ] = = ' { ' ? ind + " } " : " \1 " ) ;
2024-01-22 02:31:32 +01:00
break ;
}
break ;
case ' g ' :
if ( ! fi . param_out . empty ( ) & & ln_is ( ln , i , " goto " ) )
{
// outputs were already recorded: skip pushes until a line starting with the goto target followed by a colon is reached
2024-01-22 03:29:57 +01:00
set_str_if_empty ( & skp_push_utl , ln . substr ( i + 5 , ln . length ( ) - i - 6 ) + " : " ) ;
2024-01-22 02:31:32 +01:00
break ;
}
break ;
case ' r ' : // check for return
if ( fi . param_out_cnt = = - 1 ) break ; // this function has a dynamic number of outputs, no need for further processing
if ( ln_is ( ln , i , " return " ) )
{ // enough characters left to be the simplest return
// the returned expression is the text between the keyword and the last character of the line
ret_val = parse_int ( ln . substr ( i + 7 , ln . length ( ) - i - 8 ) ) ;
// check if already encountered return value (except 0) matches; dynamic if not
if ( ret_val ! = 0 )
{ // TODO probably need to check if i am currently skipping push
fi . param_out_cnt = ( fi . param_out_cnt = = 0 | | fi . param_out_cnt = = ret_val ) ? ret_val : - 1 ;
if ( ret_val = = fi . param_out . size ( ) )
{ // found a return statement and return value matches output param count
// presumably this marker never matches the start of a real line, so pushes stay skipped until the next address line resets skp_push_utl - confirm
2024-01-22 03:29:57 +01:00
set_str_if_empty ( & skp_push_utl , " \1 skip2end " ) ;
2024-01-22 02:31:32 +01:00
}
}
break ;
}
break ;
2024-01-22 03:29:57 +01:00
case ' w ' :
if ( ln_is ( ln , i , " while ( " ) )
{ // setting loop end, if not already in a loop
set_str_if_empty ( & lp_utl , ind + " } " ) ;
}
break ;
2024-01-22 02:31:32 +01:00
}
//pdbg(ln);
//pdbg(ind + "<-");
if ( ! enc_usg )
{ // usage string can only occur once anyway
if ( std : : regex_search ( ln , match , usage_regex ) )
{ // found usg string
fi . usg = match [ 1 ] ;
size_t found = - 1 ; // so the first find uses 0 through the increment
// replace every escaped quote in the usage string with a plain quote
while ( ( found = fi . usg . find ( " \\ \" " , found + 1 ) ) ! = std : : string : : npos ) fi . usg . replace ( found , 2 , " \" " ) ;
enc_usg = true ;
skp_push_utl = ! skp_push_utl . empty ( ) ? skp_push_utl : " \1 " ; // sometimes lua_pushfstring is used before lua_error; do not consider this an output!
}
}
// inputs are collected from lua_is* and lua_to* calls until the input count has been marked dynamic
if ( fi . param_in_cnt ! = - 1 )
{
process_varmap_regex ( & fi . param_in , ln , & lua_is_regex ) ;
process_varmap_regex ( & fi . param_in , ln , & lua_to_regex ) ;
}
// outputs are only collected while no skip marker is active and the output count is not dynamic
2024-01-22 03:29:57 +01:00
if ( fi . param_out_cnt ! = - 1 & & skp_push_utl . empty ( ) & & process_varmap_regex ( & fi . param_out , ln , & lua_push_regex ) )
2024-01-22 02:31:32 +01:00
{
2024-01-22 03:29:57 +01:00
if ( ! lp_utl . empty ( ) )
{ // found lua_push* inside a loop
fi . param_out_cnt = - 1 ;
}
else
{ // normal processing
2024-01-22 02:31:32 +01:00
int lvl = is_if_ln ? ind . length ( ) / 2 : ( ind . length ( ) - 2 ) / 2 ; // do not subtract 1 block level, if this was a simple if line
while ( lvl > = 1 )
{ // if a push was found, track it for this and all lower block levels
push_track [ lvl ] = true ;
lvl - - ;
}
}
2024-01-22 03:29:57 +01:00
2024-01-22 02:31:32 +01:00
}
// end-of-line bookkeeping: leave the loop / skip state once the line they were waiting for has been reached
2024-01-22 03:29:57 +01:00
if ( ! lp_utl . empty ( ) & & ln . starts_with ( lp_utl ) )
lp_utl = " " ;
if ( ! skp_push_utl . empty ( ) & & ln . starts_with ( skp_push_utl ) )
2024-01-22 02:31:32 +01:00
skp_push_utl = " " ; // reset skip since line has been reached now
if ( skp_push_utl = = " \1 " )
skp_push_utl = " " ; // reset temporary skip which was used for 1 line
}
}
auto tend = std : : chrono : : high_resolution_clock : : now ( ) ;
auto duration = duration_cast < std : : chrono : : milliseconds > ( tend - tstart ) ;
int cnt_invalid = 0 ;
int cnt_total = 0 ;
// print the parsed functions (all of them, or only the invalid ones) and count them
for ( auto & [ name , fi ] : fmap )
{
if ( dbg & & ( ! dbg_prnt_invld_only | | ! fi . valid_parse ) ) pfi ( fi ) ;
cnt_invalid + = fi . valid_parse ? 0 : 1 ;
cnt_total + + ;
}
std : : cout < < " GhidraParser is done... " < < std : : endl ;
std : : cout < < " Functions parsed: " < < std : : to_string ( cnt_total ) < < std : : endl ;
std : : cout < < " Invalid parses: " < < std : : to_string ( cnt_invalid ) < < std : : endl ;
std : : cout < < " duration: " < < duration . count ( ) < < " ms " < < std : : endl ;
// wait for input before returning
std : : cin . ignore ( ) ;
return 0 ;
}