From a0665d8c46e38b1306759821adf0172421614e23 Mon Sep 17 00:00:00 2001 From: Minoru Kobayashi Date: Mon, 29 Aug 2022 16:52:42 +0900 Subject: [PATCH 1/3] Improved Windows compatibility and string format, added CPU ARM64 Improved compatibility with Windows and string format for hash (more closer imphash), added CPU type ARM64 --- .gitignore | 2 ++ requirements.txt | 5 +++-- setup.py | 7 ++++--- symhash/__init__.py | 35 ++++++++++++++++++++++++++++------- symhash/machoinfo.py | 23 ++++++++++++++++++----- 5 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d96bdda --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +symhash.egg-info/ diff --git a/requirements.txt b/requirements.txt index 3ee1c3a..359ba98 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ -filemagic==1.6 -future==0.15.2 +python-magic>=0.4.27 +python-magic-bin>=0.4.14 +future>=0.18.2 diff --git a/setup.py b/setup.py index 94d7f61..aac74e7 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='symhash', - version='0.0.2', + version='0.0.3', url='https://github.com/threatstream/symhash', author='Aaron Shelmire', author_email='aaron.shelmire@anomali.com', @@ -16,8 +16,9 @@ 'bin/symhash' ], install_requires=[ - 'filemagic==1.6', - 'future==0.15.2', + 'python-magic>=0.4.27', + 'python-magic-bin>=0.4.14', + 'future>=0.18.2', ], description='Anomali Symhash', classifiers=[ diff --git a/symhash/__init__.py b/symhash/__init__.py index 02c3d7f..80aec78 100755 --- a/symhash/__init__.py +++ b/symhash/__init__.py @@ -12,12 +12,24 @@ # symhash walks the symbol table (read: loaded API calls) # creates a list and hashes those -import magic +import os from hashlib import md5 +import magic + from symhash.machoinfo import MachOEntity, MachOParser, MachOParserError +def get_dylib_name_by_ordinal(dylib_list, library_ordinal, basename_only = False): + if library_ordinal > 0 and library_ordinal 
<= 253: + if basename_only: + return os.path.basename(dylib_list[library_ordinal - 1]) + else: + return dylib_list[library_ordinal - 1] + elif library_ordinal in (0, 254, 255): # 0 = invalid, 254 = DYNAMIC_LOOKUP_ORDINAL, 255 = EXECUTABLE_ORDINAL + return None + + def create_sym_hash(filename=None, data=None): # create the sym hash if filename: @@ -27,8 +39,7 @@ def create_sym_hash(filename=None, data=None): if not data: return - with magic.Magic() as m: - filetype = m.id_buffer(data[0:1000]) + filetype = magic.from_buffer(data[0:1024]) if 'Mach-O' not in filetype: print("Data provided is not a valid Mach-O filetype") @@ -51,16 +62,26 @@ def create_sym_hash(filename=None, data=None): entity.filetype_str, entity.magic_str) - sym_list = [] + dylib_list = [] + for cmd in entity.cmdlist: + if cmd['cmd'] == MachOEntity.LC_LOAD_DYLIB: + dylib_list.append(cmd['dylib'].decode()) + sym_list = [] for cmd in entity.cmdlist: if cmd['cmd'] == MachOEntity.LC_SYMTAB: for sym in cmd['symbols']: if not sym['is_stab']: if sym['external'] is True: - if sym['n_type'] == '0x00': - sym_list.append(sym.get('string', '').decode()) - + if sym['n_type'] == '0x00': # 0x00 = N_UNDF + library_ordinal = (sym['n_desc'] >> 8) & 0xff + if library_ordinal > 0: + dylib_name = get_dylib_name_by_ordinal(dylib_list, library_ordinal) + if dylib_name: + sym_list.append("{}.{}".format(dylib_name, sym.get('string', '').decode())) + # print("{}\t{}".format(dylib_name, sym.get('string', '').decode())) + + # print(','.join(sorted(sym_list)).encode()) symhash = md5(','.join(sorted(sym_list)).encode()).hexdigest() sym_dict[entity_string] = symhash diff --git a/symhash/machoinfo.py b/symhash/machoinfo.py index 158dfd2..6cbb9b7 100644 --- a/symhash/machoinfo.py +++ b/symhash/machoinfo.py @@ -39,12 +39,12 @@ # XXX: There are a lot of comments indicating we should check we aren't # parsing past the end of a slice. These should all be fixed. 
;) -import struct import binascii - -from hashlib import md5 +import struct from builtins import range from datetime import datetime +from hashlib import md5 + from future.utils import iteritems @@ -63,11 +63,14 @@ class MachOEntity(object): # CPU Types (not complete) CPU_ARCH_ABI64 = 0x01000000 + CPU_ARCH_ABI64_32 = 0x02000000 CPU_TYPE_POWERPC = 0x00000012 CPU_TYPE_X86 = 0x00000007 CPU_TYPE_ARM = 0x0000000C CPU_TYPE_POWERPC64 = CPU_TYPE_POWERPC | CPU_ARCH_ABI64 CPU_TYPE_X86_64 = CPU_TYPE_X86 | CPU_ARCH_ABI64 + CPU_TYPE_ARM64 = CPU_TYPE_ARM | CPU_ARCH_ABI64 + CPU_TYPE_ARM64_32 = CPU_TYPE_ARM | CPU_ARCH_ABI64_32 # CPU Subtypes (not complete) CPU_SUBTYPE_MASK = 0xFF000000 @@ -83,6 +86,11 @@ class MachOEntity(object): CPU_SUBTYPE_ARM_V7 = 0x00000009 CPU_SUBTYPE_ARM_V7F = 0x0000000A CPU_SUBTYPE_ARM_V7K = 0x0000000C + CPU_SUBTYPE_ARM64_ALL = 0x00000000 + CPU_SUBTYPE_ARM64_V8 = 0x00000001 + CPU_SUBTYPE_ARM64E = 0x00000002 + CPU_SUBTYPE_ARM64_32_ALL = 0x00000000 + CPU_SUBTYPE_ARM64_32_V8 = 0x00000001 # Filetype MH_OBJECT = 0x00000001 @@ -312,7 +320,8 @@ def __init__(self): self.CPU_TYPE_X86: 'Intel', self.CPU_TYPE_POWERPC64: 'PPC64', self.CPU_TYPE_X86_64: 'Intel (64-bit)', - self.CPU_TYPE_ARM: 'ARM' + self.CPU_TYPE_ARM: 'ARM', + self.CPU_TYPE_ARM64: 'ARM64' } # CPU subtype mapping @@ -614,7 +623,7 @@ def cpu_subtype_str(self): return self.cpu_ppc_subtypes.get(self.cpu_subtype & ~self.CPU_SUBTYPE_MASK, "0x%08x" % self.cpu_subtype) elif self.cpu_type in [self.CPU_TYPE_X86, self.CPU_TYPE_X86_64]: return self.cpu_x86_subtypes.get(self.cpu_subtype & ~self.CPU_SUBTYPE_MASK, "0x%08x" % self.cpu_subtype) - elif self.cpu_type in [self.CPU_TYPE_ARM]: + elif self.cpu_type in [self.CPU_TYPE_ARM, self.CPU_TYPE_ARM64]: return self.cpu_arm_subtypes.get(self.cpu_subtype & ~self.CPU_SUBTYPE_MASK, "0x%08x" % self.cpu_subtype) else: return "0x%08x" % self.cpu_subtype @@ -1017,6 +1026,10 @@ def parse_lc_symtab_sub(self, cmd_dict, data): else: sym['external'] = False + sym['n_sect'] = n_sect 
+ sym['n_desc'] = n_desc + sym['n_value'] = n_value + symbols.append(sym) ptr = ptr[nlist_size:] From b6a6dabaab0d0b1ba411816e24b5322b2412b603 Mon Sep 17 00:00:00 2001 From: Minoru Kobayashi Date: Thu, 1 Sep 2022 10:54:58 +0900 Subject: [PATCH 2/3] Added symfuzzy command and refactored other code --- bin/symfuzzy | 57 +++++++++++++++++++++++ requirements.txt | 1 + setup.py | 1 + symhash/__init__.py | 109 +++++++++++++++++++++++++++++--------------- 4 files changed, 130 insertions(+), 38 deletions(-) create mode 100644 bin/symfuzzy diff --git a/bin/symfuzzy b/bin/symfuzzy new file mode 100644 index 0000000..6b5f461 --- /dev/null +++ b/bin/symfuzzy @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +# +# Fuzzy hashing for Mach-O symbol table +# This program is inspired by impfuzzy (https://github.com/JPCERTCC/impfuzzy) +# +# Copyright (C) 2022 Minoru Kobayashi (@unkn0wnbit) +# +# This software is released under the MIT License. +# https://opensource.org/licenses/MIT +# + +import argparse +import sys + +import ssdeep +from symhash import create_sym_fuzzyhash + + +def main(): + parser = argparse.ArgumentParser(description='SymFuzzy: a program to calculate Fuzzy Hash from symbol table of Mach-O files.') + parser.add_argument('-f', '--file', action='store', type=str, help='Specify a Mach-O file to calculate Fuzzy Hash.', required=True) + parser.add_argument('-f2', '--file2', action='store', type=str, help='Specify a Mach-O file to be compared with Fuzzy Hash of "--file"') + args = parser.parse_args() + + sym_fuzzyhash = create_sym_fuzzyhash(args.file) + if not sym_fuzzyhash: + return + + if args.file2: + sym_fuzzyhash2 = create_sym_fuzzyhash(args.file2) + if not sym_fuzzyhash2: + return + + for arch, fuzzyhash in sym_fuzzyhash.items(): + try: + fuzzyhash2 = sym_fuzzyhash2.pop(arch) + match_value = ssdeep.compare(fuzzyhash, fuzzyhash2) + print("Binary architecture: {}".format(arch)) + print("{}: {}".format(args.file, fuzzyhash)) + print("{}: {}".format(args.file2, fuzzyhash2)) +
print("Match value: {}".format(match_value)) + print("-"*50) + except KeyError: + print("{} does not have an architecture binary for\"{}\"".format(args.file2, arch)) + + if len(sym_fuzzyhash2) > 0: + for arch in sym_fuzzyhash2.keys(): + print("{} does not have an architecture binary for\"{}\"".format(args.file, arch)) + + else: + for arch, fuzzyhash in sym_fuzzyhash.items(): + print("{}: {}".format(arch, fuzzyhash)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/requirements.txt b/requirements.txt index 359ba98..0cb895d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ python-magic>=0.4.27 python-magic-bin>=0.4.14 future>=0.18.2 +ssdeep>=3.4 diff --git a/setup.py b/setup.py index aac74e7..c0f1a1a 100755 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ 'python-magic>=0.4.27', 'python-magic-bin>=0.4.14', 'future>=0.18.2', + 'ssdeep>=3.4', ], description='Anomali Symhash', classifiers=[ diff --git a/symhash/__init__.py b/symhash/__init__.py index 80aec78..6508171 100755 --- a/symhash/__init__.py +++ b/symhash/__init__.py @@ -13,28 +13,22 @@ # creates a list and hashes those import os +import sys from hashlib import md5 import magic +import ssdeep from symhash.machoinfo import MachOEntity, MachOParser, MachOParserError -def get_dylib_name_by_ordinal(dylib_list, library_ordinal, basename_only = False): - if library_ordinal > 0 and library_ordinal <= 253: - if basename_only: - return os.path.basename(dylib_list[library_ordinal - 1]) - else: - return dylib_list[library_ordinal - 1] - elif library_ordinal in (0, 254, 255): # 0 = invalid, 254 = DYNAMIC_LOOKUP_ORDINAL, 255 = EXECUTABLE_ORDINAL - return None - - -def create_sym_hash(filename=None, data=None): - # create the sym hash +def parse_macho(filename=None, data=None): if filename: - with open(filename, 'rb') as f: - data = f.read() + if os.path.isfile(filename): + with open(filename, 'rb') as f: + data = f.read() + else: + sys.exit("Error: {} is not a file.".format(filename)) if not 
data: return @@ -53,36 +47,75 @@ def create_sym_hash(filename=None, data=None): print("Error {}".format(e)) return + return macho_parser + + +def get_dylib_name_by_ordinal(dylib_list, library_ordinal, basename_only = False): + if library_ordinal > 0 and library_ordinal <= 253: + if basename_only: + return os.path.basename(dylib_list[library_ordinal - 1]) + else: + return dylib_list[library_ordinal - 1] + elif library_ordinal in (0, 254, 255): # 0 = invalid, 254 = DYNAMIC_LOOKUP_ORDINAL, 255 = EXECUTABLE_ORDINAL + return None + + +def get_dylib_list(entity): + dylib_list = [] + for cmd in entity.cmdlist: + if cmd['cmd'] == MachOEntity.LC_LOAD_DYLIB: + dylib_list.append(cmd['dylib'].decode()) + + return dylib_list + + +def get_import_symbol_list(entity, dylib_list): + sym_list = [] + for cmd in entity.cmdlist: + if cmd['cmd'] == MachOEntity.LC_SYMTAB: + for sym in cmd['symbols']: + if not sym['is_stab']: + if sym['external'] is True: + if sym['n_type'] == '0x00': # 0x00 = N_UNDF + library_ordinal = (sym['n_desc'] >> 8) & 0xff + if library_ordinal > 0: + dylib_name = get_dylib_name_by_ordinal(dylib_list, library_ordinal) + if dylib_name: + sym_list.append("{}.{}".format(dylib_name, sym.get('string', '').decode())) + # print("{}\t{}".format(dylib_name, sym.get('string', '').decode())) + + return sym_list + + +def create_sym_hash(filename=None, data=None): + macho_parser = parse_macho(filename, data) sym_dict = {} for entity in macho_parser.entities: if entity.magic_str != 'Universal': - - entity_string = "{} {} {}".format(entity.cpu_type_str, - entity.filetype_str, - entity.magic_str) - - dylib_list = [] - for cmd in entity.cmdlist: - if cmd['cmd'] == MachOEntity.LC_LOAD_DYLIB: - dylib_list.append(cmd['dylib'].decode()) - - sym_list = [] - for cmd in entity.cmdlist: - if cmd['cmd'] == MachOEntity.LC_SYMTAB: - for sym in cmd['symbols']: - if not sym['is_stab']: - if sym['external'] is True: - if sym['n_type'] == '0x00': # 0x00 = N_UNDF - library_ordinal = (sym['n_desc'] 
>> 8) & 0xff - if library_ordinal > 0: - dylib_name = get_dylib_name_by_ordinal(dylib_list, library_ordinal) - if dylib_name: - sym_list.append("{}.{}".format(dylib_name, sym.get('string', '').decode())) - # print("{}\t{}".format(dylib_name, sym.get('string', '').decode())) - + dylib_list = get_dylib_list(entity) + sym_list = get_import_symbol_list(entity, dylib_list) # print(','.join(sorted(sym_list)).encode()) + # print("Number of symbols: {}".format(len(sym_list))) symhash = md5(','.join(sorted(sym_list)).encode()).hexdigest() + entity_string = "{} {} {}".format(entity.cpu_type_str, entity.filetype_str, entity.magic_str) sym_dict[entity_string] = symhash return sym_dict + + +def create_sym_fuzzyhash(filename=None, data=None): + macho_parser = parse_macho(filename, data) + sym_fuzzy_dict = {} + + for entity in macho_parser.entities: + if entity.magic_str != 'Universal': + dylib_list = get_dylib_list(entity) + sym_list = get_import_symbol_list(entity, dylib_list) + # print(','.join(sorted(sym_list)).encode()) + # print("Number of symbols: {}".format(len(sym_list))) + symfuzzyhash = ssdeep.hash(','.join(sorted(sym_list)).encode()) + entity_string = "{} {} {}".format(entity.cpu_type_str, entity.filetype_str, entity.magic_str) + sym_fuzzy_dict[entity_string] = symfuzzyhash + + return sym_fuzzy_dict From ddaf85deaaa5736d5197f3c68559dba062dbb5dd Mon Sep 17 00:00:00 2001 From: Minoru Kobayashi Date: Fri, 2 Sep 2022 14:57:06 +0900 Subject: [PATCH 3/3] Added "mode" option --- bin/symfuzzy | 24 +++++++++++++++++++----- bin/symhash | 18 ++++++++++++++++-- setup.py | 4 +--- symhash/__init__.py | 44 ++++++++++++++++++++++++++++++++++---------- 4 files changed, 70 insertions(+), 20 deletions(-) diff --git a/bin/symfuzzy b/bin/symfuzzy index 6b5f461..0d6344e 100644 --- a/bin/symfuzzy +++ b/bin/symfuzzy @@ -14,21 +14,35 @@ import argparse import sys import ssdeep -from symhash import create_sym_fuzzyhash +from symhash import HashMode, create_sym_fuzzyhash def main(): parser = 
argparse.ArgumentParser(description='SymFuzzy: a program to calculate Fuzzy Hash from symbol table of Mach-O files.') - parser.add_argument('-f', '--file', action='store', type=str, help='Specify a Mach-O file to calculate Fuzzy Hash.', required=True) - parser.add_argument('-f2', '--file2', action='store', type=str, help='Specify a Mach-O file to be compared with Fuzzy Hash of "--file"') + parser.add_argument('-f', '--file', action='store', type=str, + help='Specify a Mach-O file to calculate Fuzzy Hash.', required=True) + parser.add_argument('-f2', '--file2', action='store', type=str, + help='Specify a Mach-O file to be compared with Fuzzy Hash of "--file"') + parser.add_argument('-m', '--mode', action='store', type=str, default='ALL', + help='Order of APIs for calculation hash, which can be specified "SYMTAB" or "SORT".') args = parser.parse_args() - sym_fuzzyhash = create_sym_fuzzyhash(args.file) + args.mode = args.mode.upper() + if args.mode == 'ALL': + hash_mode = HashMode.ALL + elif args.mode == 'SYMTAB': + hash_mode = HashMode.SYMTAB + elif args.mode == 'SORT': + hash_mode = HashMode.SORT + else: + sys.exit("Error: Unsupported mode = {}".format(args.mode)) + + sym_fuzzyhash = create_sym_fuzzyhash(args.file, hash_mode=hash_mode) if not sym_fuzzyhash: return if args.file2: - sym_fuzzyhash2 = create_sym_fuzzyhash(args.file2) + sym_fuzzyhash2 = create_sym_fuzzyhash(args.file2, hash_mode=hash_mode) if not sym_fuzzyhash2: return diff --git a/bin/symhash b/bin/symhash index e75da79..fbea052 100755 --- a/bin/symhash +++ b/bin/symhash @@ -13,9 +13,10 @@ # creates a list and hashes those import argparse +import sys from future.utils import iteritems -from symhash import create_sym_hash +from symhash import HashMode, create_sym_hash def main(): @@ -24,15 +25,28 @@ def main(): opt.add_argument( '-f', '--file', help='The file to create a SymHash from', required=True ) + opt.add_argument('-m', '--mode', action='store', type=str, default='ALL', + help='Order of APIs for 
calculation hash, which can be specified "SYMTAB" or "SORT".') opt.add_argument( '-v', '--verbose', help='Verbose output', required=False, action='store_true' ) options = opt.parse_args() + + options.mode = options.mode.upper() + if options.mode == 'ALL': + hash_mode = HashMode.ALL + elif options.mode == 'SYMTAB': + hash_mode = HashMode.SYMTAB + elif options.mode == 'SORT': + hash_mode = HashMode.SORT + else: + sys.exit("Error: Unsupported mode = {}".format(options.mode)) + f_name = options.file - s = create_sym_hash(f_name) + s = create_sym_hash(f_name, hash_mode=hash_mode) if not s: return diff --git a/setup.py b/setup.py index c0f1a1a..f25b0bf 100755 --- a/setup.py +++ b/setup.py @@ -26,8 +26,6 @@ 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 'Operating System :: POSIX', 'Programming Language :: Other Scripting Engines', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.6', ] ) diff --git a/symhash/__init__.py b/symhash/__init__.py index 6508171..7f2f108 100755 --- a/symhash/__init__.py +++ b/symhash/__init__.py @@ -14,6 +14,7 @@ import os import sys +from enum import Flag, auto from hashlib import md5 import magic @@ -22,6 +23,12 @@ from symhash.machoinfo import MachOEntity, MachOParser, MachOParserError +class HashMode(Flag): + SYMTAB = auto() + SORT = auto() + ALL = SYMTAB | SORT + + def parse_macho(filename=None, data=None): if filename: if os.path.isfile(filename): @@ -84,10 +91,13 @@ def get_import_symbol_list(entity, dylib_list): sym_list.append("{}.{}".format(dylib_name, sym.get('string', '').decode())) # print("{}\t{}".format(dylib_name, sym.get('string', '').decode())) + # print(','.join(sorted(sym_list)).encode()) + # print("Number of symbols: {}".format(len(sym_list))) + return sym_list -def create_sym_hash(filename=None, data=None): +def create_sym_hash(filename=None, data=None, hash_mode=HashMode.ALL): macho_parser 
= parse_macho(filename, data) sym_dict = {} @@ -95,16 +105,23 @@ def create_sym_hash(filename=None, data=None): if entity.magic_str != 'Universal': dylib_list = get_dylib_list(entity) sym_list = get_import_symbol_list(entity, dylib_list) - # print(','.join(sorted(sym_list)).encode()) - # print("Number of symbols: {}".format(len(sym_list))) - symhash = md5(','.join(sorted(sym_list)).encode()).hexdigest() + entity_string = "{} {} {}".format(entity.cpu_type_str, entity.filetype_str, entity.magic_str) - sym_dict[entity_string] = symhash + # Order of APIs in symbol table + if hash_mode & HashMode.SYMTAB: + symhash = md5(','.join(sym_list).encode()).hexdigest() + sym_dict[entity_string] = symhash + + # Sort APIs into alphabetical order + if hash_mode & HashMode.SORT: + symhash_sorted = md5(','.join(sorted(sym_list)).encode()).hexdigest() + entity_string += " (Sorted APIs)" + sym_dict[entity_string] = symhash_sorted return sym_dict -def create_sym_fuzzyhash(filename=None, data=None): +def create_sym_fuzzyhash(filename=None, data=None, hash_mode=HashMode.ALL): macho_parser = parse_macho(filename, data) sym_fuzzy_dict = {} @@ -112,10 +129,17 @@ def create_sym_fuzzyhash(filename=None, data=None): if entity.magic_str != 'Universal': dylib_list = get_dylib_list(entity) sym_list = get_import_symbol_list(entity, dylib_list) - # print(','.join(sorted(sym_list)).encode()) - # print("Number of symbols: {}".format(len(sym_list))) - symfuzzyhash = ssdeep.hash(','.join(sorted(sym_list)).encode()) + entity_string = "{} {} {}".format(entity.cpu_type_str, entity.filetype_str, entity.magic_str) - sym_fuzzy_dict[entity_string] = symfuzzyhash + # Order of APIs in symbol table + if hash_mode & HashMode.SYMTAB: + symfuzzyhash = ssdeep.hash(','.join(sym_list).encode()) + sym_fuzzy_dict[entity_string] = symfuzzyhash + + # Sort APIs into alphabetical order + if hash_mode & HashMode.SORT: + symfuzzyhash_sorted = ssdeep.hash(','.join(sorted(sym_list)).encode()) + entity_string += " (Sorted APIs)" 
+ sym_fuzzy_dict[entity_string] = symfuzzyhash_sorted return sym_fuzzy_dict