403 Downloads · Updated 8 months ago
3f004b3ceba8 · 44GB
LLM4Decompile aims to decompile x86 assembly instructions into C. The newly released V2 series is trained on a larger dataset (2B tokens) with a maximum token length of 4,096, and delivers markedly better performance (up to a 100% improvement) than the previous model.
**Re-executability Rate** (by optimization level)

| Model | O0 | O1 | O2 | O3 | AVG |
|---|---|---|---|---|---|
| LLM4Decompile-End-6.7B | 0.6805 | 0.3951 | 0.3671 | 0.3720 | 0.4537 |
| Ghidra | 0.3476 | 0.1646 | 0.1524 | 0.1402 | 0.2012 |
| +GPT-4o | 0.4695 | 0.3415 | 0.2866 | 0.3110 | 0.3522 |
| +LLM4Decompile-Ref-1.3B | 0.6890 | 0.3720 | 0.4085 | 0.3720 | 0.4604 |
| +LLM4Decompile-Ref-6.7B | 0.7439 | 0.4695 | 0.4756 | 0.4207 | 0.5274 |
| +LLM4Decompile-Ref-33B | 0.7073 | 0.4756 | 0.4390 | 0.4146 | 0.5091 |

**Edit Similarity** (by optimization level)

| Model | O0 | O1 | O2 | O3 | AVG |
|---|---|---|---|---|---|
| LLM4Decompile-End-6.7B | 0.1557 | 0.1292 | 0.1293 | 0.1269 | 0.1353 |
| Ghidra | 0.0699 | 0.0613 | 0.0619 | 0.0547 | 0.0620 |
| +GPT-4o | 0.0660 | 0.0563 | 0.0567 | 0.0499 | 0.0572 |
| +LLM4Decompile-Ref-1.3B | 0.1517 | 0.1325 | 0.1292 | 0.1267 | 0.1350 |
| +LLM4Decompile-Ref-6.7B | 0.1559 | 0.1353 | 0.1342 | 0.1273 | 0.1382 |
| +LLM4Decompile-Ref-33B | 0.1540 | 0.1379 | 0.1363 | 0.1307 | 0.1397 |
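Re-executability measures whether the decompiled code compiles and passes the original test assertions; edit similarity measures textual closeness to the reference source. As a rough illustration only (not the benchmark's official harness), here is a minimal sketch of both checks for one sample, assuming `gcc` is on PATH and `test_main` is a C `main()` full of `assert`s. The function names are hypothetical, and `difflib`'s ratio is used as a stand-in for normalized edit distance:

```python
import difflib
import os
import subprocess
import tempfile

def edit_similarity(decompiled: str, reference: str) -> float:
    # Stand-in for the paper's normalized edit distance: both score
    # textual closeness between candidate and reference in [0, 1].
    return difflib.SequenceMatcher(None, decompiled, reference).ratio()

def is_reexecutable(decompiled: str, test_main: str) -> bool:
    # Compile the decompiled function together with a test driver
    # (a main() containing assert()s) and check that it runs cleanly.
    with tempfile.TemporaryDirectory() as tmp:
        src = os.path.join(tmp, 'check.c')
        exe = os.path.join(tmp, 'check')
        with open(src, 'w') as f:
            f.write(decompiled + '\n' + test_main)
        build = subprocess.run(['gcc', src, '-o', exe], capture_output=True)
        if build.returncode != 0:
            return False  # does not even compile
        run = subprocess.run([exe], capture_output=True, timeout=10)
        return run.returncode == 0  # exit code 0 means every assert passed
```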
Usage:

1. Pull the model: `ollama pull MHKetbi/llm4decompile-22b-v2`
2. Install the Ollama Python library: `pip install ollama`
3. Save the script below to a file (e.g., `decompiler.py`).
4. Create `my_assembly_file_O0.pseudo` (replace `my_assembly_file` with your desired filename) containing the assembly code of the function you want to decompile.
5. Run the script: `python decompiler.py my_assembly_file`

```python
import ollama
import sys
MODEL_NAME = 'MHKetbi/llm4decompile-22b-v2'
def decompile_with_ollama(asm_func, max_tokens=2048):
    """
    Decompiles an assembly function using Ollama.
    Args:
        asm_func: The assembly function as a string.
        max_tokens: The maximum number of tokens to generate.
    Returns:
        The decompiled C code as a string, or None if an error occurred.
    """
    try:
        # Construct the prompt for the Ollama model; prompt quality is crucial for good results.
        # The system prompt sets the overall task, and the user prompt supplies
        # the assembly input and asks for the C output.
        messages = [
            {
                'role': 'system',
                'content': 'You are a helpful assistant that decompiles assembly code to C code.  '
                           'Provide only the C code, without any extra explanation or comments. '
                           'Do not include markdown formatting. Do not wrap the output in ```.'
            },
            {
                'role': 'user',
                'content': f'Decompile the following assembly code to C:\n\n{asm_func}'
            }
        ]
        # Pass max_tokens through as num_predict to cap generation length.
        response = ollama.chat(model=MODEL_NAME, messages=messages, stream=False,
                               options={'num_predict': max_tokens})
        # Check if the response is valid and contains the decompiled code.
        if response and response.get('message') and response['message'].get('content'):
            c_func_decompile = response['message']['content'].strip()
            return c_func_decompile
        else:
            print("Error: Ollama did not return a valid response.", file=sys.stderr)
            return None
    except ollama.ResponseError as e:
        print(f"Error during Ollama request: {e}", file=sys.stderr)
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        return None
def decompile_with_ollama_streaming(asm_func, max_tokens=2048):
    """
    Decompiles an assembly function using Ollama with streaming.
    Args:
        asm_func: The assembly function as a string.
        max_tokens: The maximum number of tokens to generate.
    Returns:
        The decompiled C code as a string, or None if an error occurred.
    """
    try:
        messages = [
            {
                'role': 'system',
                'content': 'You are a helpful assistant that decompiles assembly code to C code.  '
                           'Provide only the C code, without any extra explanation or comments. '
                           'Do not include markdown formatting. Do not wrap the output in ```.'
            },
            {
                'role': 'user',
                'content': f'Decompile the following assembly code to C:\n\n{asm_func}'
            }
        ]
        stream = ollama.chat(model=MODEL_NAME, messages=messages, stream=True,
                             options={'num_predict': max_tokens})
        c_func_decompile = ""
        for chunk in stream:
            if chunk and chunk.get('message') and chunk['message'].get('content'):
                c_func_decompile += chunk['message']['content']
        return c_func_decompile.strip()
    except ollama.ResponseError as e:
        print(f"Error during Ollama request: {e}", file=sys.stderr)
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        return None
def main():
    if len(sys.argv) != 2:
        print("Usage: python decompiler.py <filename>  (reads <filename>_O0.pseudo)", file=sys.stderr)
        sys.exit(1)
    file_name = sys.argv[1]
    opt_level = 'O0'  # Kept for consistency with the original script's file naming.
    pseudo_path = f'{file_name}_{opt_level}.pseudo'
    try:
        with open(pseudo_path, 'r') as f:
            asm_func = f.read()
    except FileNotFoundError:
        print(f"Error: File '{pseudo_path}' not found.", file=sys.stderr)
        sys.exit(1)
    # Choose either the streaming or the non-streaming version.
    # c_func_decompile = decompile_with_ollama(asm_func)
    c_func_decompile = decompile_with_ollama_streaming(asm_func)
    if c_func_decompile:
        # asm_func already holds the original input; no need to re-read the file.
        print(f'pseudo function:\n{asm_func}')
        print(f'refined function:\n{c_func_decompile}')
    else:
        print("Decompilation failed.", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    main()
```
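The script expects its input in `<filename>_O0.pseudo`. If you are starting from C source rather than a binary, one simple way to produce that file is to let gcc emit assembly. A minimal sketch, assuming gcc is installed (the helper `make_pseudo` and its command line are illustrative, not part of this repository):

```python
import subprocess
import sys

def make_pseudo(c_source: str, name: str, opt_level: str = 'O0') -> str:
    """Compile a C file to assembly, saved as <name>_<opt_level>.pseudo."""
    out_path = f'{name}_{opt_level}.pseudo'
    # gcc -S stops after compilation and writes assembly instead of an object file.
    subprocess.run(['gcc', f'-{opt_level}', '-S', c_source, '-o', out_path],
                   check=True)
    return out_path

if __name__ == '__main__':
    # Example: python make_pseudo.py test.c my_assembly_file
    print(make_pseudo(sys.argv[1], sys.argv[2]))
```

If you only have a compiled binary, disassembling it with `objdump -d` and pasting the target function into the `.pseudo` file serves the same purpose.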
This code repository is licensed under the MIT License.
If you have any questions, please raise an issue.