#!/usr/bin/env python

#
# Fuzzy hashing for Mach-O symbol table
# This program is inspired by impfuzzy (https://github.com/JPCERTCC/impfuzzy)
#
# Copyright (C) 2022 Minoru Kobayashi (@unkn0wnbit)
#
# This software is released under the MIT License.
# https://opensource.org/licenses/MIT
#

import argparse
import sys

import ssdeep
from symhash import HashMode, create_sym_fuzzyhash


def main():
    """Command-line entry point for SymFuzzy.

    Computes the fuzzy hash of the import symbols of ``--file`` and either
    prints it per architecture or, when ``--file2`` is given, prints the
    ssdeep match value for every architecture the two files share.

    Returns:
        int: 0 on success, 1 when a hash could not be produced for one of
        the files (so the shell sees a failing exit status).
    """
    parser = argparse.ArgumentParser(
        description='SymFuzzy: a program to calculate Fuzzy Hash from symbol table of Mach-O files.')
    parser.add_argument('-f', '--file', action='store', type=str,
                        help='Specify a Mach-O file to calculate Fuzzy Hash.', required=True)
    parser.add_argument('-f2', '--file2', action='store', type=str,
                        help='Specify a Mach-O file to be compared with Fuzzy Hash of "--file"')
    parser.add_argument('-m', '--mode', action='store', type=str, default='ALL',
                        help='Order of APIs used to calculate the hash: "SYMTAB" (symbol table order), '
                             '"SORT" (alphabetical order) or "ALL" (both, default).')
    args = parser.parse_args()

    # Mode names are matched case-insensitively.
    hash_modes = {
        'ALL': HashMode.ALL,
        'SYMTAB': HashMode.SYMTAB,
        'SORT': HashMode.SORT,
    }
    args.mode = args.mode.upper()
    if args.mode not in hash_modes:
        sys.exit("Error: Unsupported mode = {}".format(args.mode))
    hash_mode = hash_modes[args.mode]

    sym_fuzzyhash = create_sym_fuzzyhash(args.file, hash_mode=hash_mode)
    if not sym_fuzzyhash:
        return 1

    if not args.file2:
        for arch, fuzzyhash in sym_fuzzyhash.items():
            print("{}: {}".format(arch, fuzzyhash))
        return 0

    sym_fuzzyhash2 = create_sym_fuzzyhash(args.file2, hash_mode=hash_mode)
    if not sym_fuzzyhash2:
        return 1

    # Architectures of the first file, compared against the second one.
    for arch, fuzzyhash in sym_fuzzyhash.items():
        if arch not in sym_fuzzyhash2:
            print("{} does not have an architecture binary for \"{}\"".format(args.file2, arch))
            continue

        fuzzyhash2 = sym_fuzzyhash2[arch]
        match_value = ssdeep.compare(fuzzyhash, fuzzyhash2)
        print("Binary architecture: {}".format(arch))
        print("{}: {}".format(args.file, fuzzyhash))
        print("{}: {}".format(args.file2, fuzzyhash2))
        print("Match value: {}".format(match_value))
        print("-" * 50)

    # Architectures that only the second file provides.
    for arch in sym_fuzzyhash2:
        if arch not in sym_fuzzyhash:
            print("{} does not have an architecture binary for \"{}\"".format(args.file, arch))

    return 0


if __name__ == "__main__":
    sys.exit(main())
a/setup.py b/setup.py index 94d7f61..f25b0bf 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='symhash', - version='0.0.2', + version='0.0.3', url='https://github.com/threatstream/symhash', author='Aaron Shelmire', author_email='aaron.shelmire@anomali.com', @@ -16,16 +16,16 @@ 'bin/symhash' ], install_requires=[ - 'filemagic==1.6', - 'future==0.15.2', + 'python-magic>=0.4.27', + 'python-magic-bin>=0.4.14', + 'future>=0.18.2', + 'ssdeep>=3.4', ], description='Anomali Symhash', classifiers=[ 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 'Operating System :: POSIX', 'Programming Language :: Other Scripting Engines', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.6', ] ) diff --git a/symhash/__init__.py b/symhash/__init__.py index 02c3d7f..7f2f108 100755 --- a/symhash/__init__.py +++ b/symhash/__init__.py @@ -12,23 +12,35 @@ # symhash walks the symbol table (read: loaded API calls) # creates a list and hashes those -import magic +import os +import sys +from enum import Flag, auto from hashlib import md5 +import magic +import ssdeep + from symhash.machoinfo import MachOEntity, MachOParser, MachOParserError -def create_sym_hash(filename=None, data=None): - # create the sym hash +class HashMode(Flag): + SYMTAB = auto() + SORT = auto() + ALL = SYMTAB | SORT + + +def parse_macho(filename=None, data=None): if filename: - with open(filename, 'rb') as f: - data = f.read() + if os.path.isfile(filename): + with open(filename, 'rb') as f: + data = f.read() + else: + sys.exit("Error: {} is not a file.".format(filename)) if not data: return - with magic.Magic() as m: - filetype = m.id_buffer(data[0:1000]) + filetype = magic.from_buffer(data[0:1024]) if 'Mach-O' not in filetype: print("Data provided is not a valid Mach-O filetype") @@ -42,26 +54,92 @@ def create_sym_hash(filename=None, data=None): print("Error {}".format(e)) 
def get_dylib_name_by_ordinal(dylib_list, library_ordinal, basename_only=False):
    """Return the dylib name that a two-level namespace library ordinal refers to.

    Args:
        dylib_list: Dylib names in load-command order (ordinal 1 is index 0).
        library_ordinal: Ordinal taken from the high byte of a symbol's n_desc.
        basename_only: When true, return only the final path component.

    Returns:
        The dylib name, or None when the ordinal is a special value
        (0 = invalid, 254 = DYNAMIC_LOOKUP_ORDINAL, 255 = EXECUTABLE_ORDINAL),
        lies outside 1..253, or points past the end of ``dylib_list``.
    """
    if not 0 < library_ordinal <= 253:
        return None
    # A malformed or crafted binary may reference a library that was never
    # loaded; report "unknown" instead of raising IndexError.
    if library_ordinal > len(dylib_list):
        return None

    dylib_name = dylib_list[library_ordinal - 1]
    return os.path.basename(dylib_name) if basename_only else dylib_name


def get_dylib_list(entity):
    """Return the names of the dylibs loaded by *entity*, in load-command order.

    NOTE(review): only LC_LOAD_DYLIB commands are collected. If library
    ordinals also count other dylib load commands (weak, re-export, upward),
    names resolved from this list may be shifted -- confirm against
    mach-o/loader.h before relying on the library part of a symbol name.
    """
    return [
        cmd['dylib'].decode()
        for cmd in entity.cmdlist
        if cmd['cmd'] == MachOEntity.LC_LOAD_DYLIB
    ]


def get_import_symbol_list(entity, dylib_list):
    """Return the imported symbols of *entity* as ``"<dylib>.<symbol>"`` strings.

    Only external, non-debug, undefined symbols (n_type '0x00' = N_UNDF) whose
    library ordinal resolves to an entry of ``dylib_list`` are included. The
    symbol-table order is preserved.
    """
    sym_list = []
    for cmd in entity.cmdlist:
        if cmd['cmd'] != MachOEntity.LC_SYMTAB:
            continue

        for sym in cmd['symbols']:
            if sym['is_stab'] or sym['external'] is not True:
                continue
            if sym['n_type'] != '0x00':  # 0x00 = N_UNDF
                continue

            # The library ordinal is stored in the high byte of n_desc.
            library_ordinal = (sym['n_desc'] >> 8) & 0xff
            dylib_name = get_dylib_name_by_ordinal(dylib_list, library_ordinal)
            if not dylib_name:
                continue

            # Fall back to empty *bytes*: a str default has no .decode().
            symbol_name = sym.get('string', b'').decode()
            sym_list.append("{}.{}".format(dylib_name, symbol_name))

    return sym_list
def _hash_import_symbols(filename, data, hash_mode, digest):
    """Hash the import symbols of every non-universal entity with *digest*.

    Args:
        filename: Path of the Mach-O file, or None to use ``data``.
        data: Raw Mach-O bytes, used when ``filename`` is not given.
        hash_mode: HashMode flags selecting which symbol orderings to hash.
        digest: Callable mapping the encoded symbol list (bytes) to a hash string.

    Returns:
        dict mapping "<cpu> <filetype> <magic>" (plus " (Sorted APIs)" for the
        sorted variant) to the hash, or None when the input could not be
        parsed as Mach-O.
    """
    macho_parser = parse_macho(filename, data)
    # parse_macho() yields None for empty, non-Mach-O or unparsable input.
    if macho_parser is None:
        return None

    hashes = {}
    for entity in macho_parser.entities:
        if entity.magic_str == 'Universal':
            continue

        sym_list = get_import_symbol_list(entity, get_dylib_list(entity))
        entity_string = "{} {} {}".format(entity.cpu_type_str, entity.filetype_str, entity.magic_str)

        # Order of APIs in symbol table
        if hash_mode & HashMode.SYMTAB:
            hashes[entity_string] = digest(','.join(sym_list).encode())

        # Sort APIs into alphabetical order
        if hash_mode & HashMode.SORT:
            hashes[entity_string + " (Sorted APIs)"] = digest(','.join(sorted(sym_list)).encode())

    return hashes


def create_sym_hash(filename=None, data=None, hash_mode=HashMode.ALL):
    """Return the MD5 symhash of each architecture in a Mach-O file.

    See ``_hash_import_symbols`` for the arguments and the shape of the
    result; None is returned when the input is not a parsable Mach-O.
    """
    return _hash_import_symbols(filename, data, hash_mode,
                                lambda payload: md5(payload).hexdigest())


def create_sym_fuzzyhash(filename=None, data=None, hash_mode=HashMode.ALL):
    """Return the ssdeep fuzzy symhash of each architecture in a Mach-O file.

    See ``_hash_import_symbols`` for the arguments and the shape of the
    result; None is returned when the input is not a parsable Mach-O.
    """
    return _hash_import_symbols(filename, data, hash_mode, ssdeep.hash)
;) -import struct import binascii - -from hashlib import md5 +import struct from builtins import range from datetime import datetime +from hashlib import md5 + from future.utils import iteritems @@ -63,11 +63,14 @@ class MachOEntity(object): # CPU Types (not complete) CPU_ARCH_ABI64 = 0x01000000 + CPU_ARCH_ABI64_32 = 0x02000000 CPU_TYPE_POWERPC = 0x00000012 CPU_TYPE_X86 = 0x00000007 CPU_TYPE_ARM = 0x0000000C CPU_TYPE_POWERPC64 = CPU_TYPE_POWERPC | CPU_ARCH_ABI64 CPU_TYPE_X86_64 = CPU_TYPE_X86 | CPU_ARCH_ABI64 + CPU_TYPE_ARM64 = CPU_TYPE_ARM | CPU_ARCH_ABI64 + CPU_TYPE_ARM64_32 = CPU_TYPE_ARM | CPU_ARCH_ABI64_32 # CPU Subtypes (not complete) CPU_SUBTYPE_MASK = 0xFF000000 @@ -83,6 +86,11 @@ class MachOEntity(object): CPU_SUBTYPE_ARM_V7 = 0x00000009 CPU_SUBTYPE_ARM_V7F = 0x0000000A CPU_SUBTYPE_ARM_V7K = 0x0000000C + CPU_SUBTYPE_ARM64_ALL = 0x00000000 + CPU_SUBTYPE_ARM64_V8 = 0x00000001 + CPU_SUBTYPE_ARM64E = 0x00000002 + CPU_SUBTYPE_ARM64_32_ALL = 0x00000000 + CPU_SUBTYPE_ARM64_32_V8 = 0x00000001 # Filetype MH_OBJECT = 0x00000001 @@ -312,7 +320,8 @@ def __init__(self): self.CPU_TYPE_X86: 'Intel', self.CPU_TYPE_POWERPC64: 'PPC64', self.CPU_TYPE_X86_64: 'Intel (64-bit)', - self.CPU_TYPE_ARM: 'ARM' + self.CPU_TYPE_ARM: 'ARM', + self.CPU_TYPE_ARM64: 'ARM64' } # CPU subtype mapping @@ -614,7 +623,7 @@ def cpu_subtype_str(self): return self.cpu_ppc_subtypes.get(self.cpu_subtype & ~self.CPU_SUBTYPE_MASK, "0x%08x" % self.cpu_subtype) elif self.cpu_type in [self.CPU_TYPE_X86, self.CPU_TYPE_X86_64]: return self.cpu_x86_subtypes.get(self.cpu_subtype & ~self.CPU_SUBTYPE_MASK, "0x%08x" % self.cpu_subtype) - elif self.cpu_type in [self.CPU_TYPE_ARM]: + elif self.cpu_type in [self.CPU_TYPE_ARM, self.CPU_TYPE_ARM64]: return self.cpu_arm_subtypes.get(self.cpu_subtype & ~self.CPU_SUBTYPE_MASK, "0x%08x" % self.cpu_subtype) else: return "0x%08x" % self.cpu_subtype @@ -1017,6 +1026,10 @@ def parse_lc_symtab_sub(self, cmd_dict, data): else: sym['external'] = False + sym['n_sect'] = n_sect 
+ sym['n_desc'] = n_desc + sym['n_value'] = n_value + symbols.append(sym) ptr = ptr[nlist_size:]