From 9180e1975e54af9fffa1cdb0d4124d6b52c92b61 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Mon, 2 Oct 2023 12:37:52 +0100 Subject: [PATCH 01/59] TRIVIAL - added req files and fix bug --- .idea/.gitignore | 8 ++++++++ CODEOWNERS | 32 ++++++++++++++++++++++++++++++++ Dockerfile | 13 +++++++++++++ README.md | 2 +- asm2vec/version.py | 2 ++ catalog-info.yaml | 15 +++++++++++++++ setup.py | 30 +++++++++++++++++++++++------- 7 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 CODEOWNERS create mode 100644 Dockerfile create mode 100644 asm2vec/version.py create mode 100644 catalog-info.yaml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..446aa21 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,32 @@ +# This is a comment. +# Each line is a file pattern followed by one or more owners. + +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @global-owner1 and @global-owner2 will be requested for +# review when someone opens a pull request. +* @wandera/datascience + +# Order is important; the last matching pattern takes the most +# precedence. When someone opens a pull request that only +# modifies JS files, only @js-owner and not the global +# owner(s) will be requested for a review. +# *.js @js-owner + +# You can also use email addresses if you prefer. They'll be +# used to look up users just like we do for commit author +# emails. +#*.go docs@example.com + +# The `docs/*` pattern will match files like +# `docs/getting-started.md` but not further nested files like +# `docs/build-app/troubleshooting.md`. +# docs/* docs@example.com + +# In this example, @octocat owns any file in an apps directory +# anywhere in your repository. +# apps/ @octocat + +# In this example, @doctocat owns any file in the `/docs` +# directory in the root of your repository. +# /docs/ @doctocat \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cb6efa5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.10.11-slim + +ADD . 
/asm2vec-pytorch +WORKDIR asm2vec-pytorch + +RUN apt-get update && apt-get install -y --no-install-recommends \ + unixodbc-dev \ + unixodbc \ + libpq-dev && \ + pip install -r requirements.txt && \ + python setup.py install + +CMD ["/bin/sh"] diff --git a/README.md b/README.md index 7a2043b..c5fc4ae 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B ## Requirements -python >= 3.6 +python >= 3.10 | packages | for | | --- | --- | diff --git a/asm2vec/version.py b/asm2vec/version.py new file mode 100644 index 0000000..f8e7582 --- /dev/null +++ b/asm2vec/version.py @@ -0,0 +1,2 @@ +VERSION = '1.0.0' +DEV_VERSION = '0' diff --git a/catalog-info.yaml b/catalog-info.yaml new file mode 100644 index 0000000..378ab88 --- /dev/null +++ b/catalog-info.yaml @@ -0,0 +1,15 @@ +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: asm2vec-pytorch + description: All code running ASM2VEC using PyTorch + labels: + - jira-key: DATASCI + - language: Python + annotations: + backstage.io/source-location: url:https://github.com/wandera/asm2vec-pytorch +spec: + type: service + lifecycle: production + owner: datascience + system: datascience diff --git a/setup.py b/setup.py index 62ff843..be492bc 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,30 @@ from setuptools import setup, find_packages +from asm2vec.version import VERSION + + +def readme(): + with open('README.md') as f: + return f.read() + + +def read_requirements(): + with open('requirements.txt') as f: + return [s for s in f.read().split('\n') if not ('--index-url' in s)] + + setup( name='asm2vec', - version='1.0.0', - description='Unofficial implementation of asm2vec using pytorch', - install_requires=['torch>=1.7,<2' - 'click>=7.1,<8' - 'r2pipe>=1.5,<2'], - author='oalieno', - author_email='jeffrey6910@gmail.com', + version=VERSION, + description="Jamf's implementation of asm2vec using pytorch", + long_description=readme(), + author='oalieno/jamf', + author_email='jamie.nutter@jamf.com', license='MIT License', + install_requires=read_requirements(), packages = find_packages(), + zip_safe=False, + include_package_data=True, + test_suite='nose.collector', + tests_require=['nose'] ) From 9b8decbcad77cf9230348c7c44fe0f0d43d1f640 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Mon, 2 Oct 2023 13:10:16 +0100 Subject: [PATCH 02/59] Create SECURITY.md --- SECURITY.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..c478391 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,26 @@ +Thanks for helping make GitHub safe for everyone. + +# Security + +Jamf takes the security of our software products and services seriously, including all of the open source code repositories managed through our GitHub organizations, such as asm2vec-pytorch. + +We will ensure that your finding gets passed along to the appropriate maintainers for remediation. + +# Reporting Security Issues + +If you believe you have found a security vulnerability in any Jamf-owned repository, please report it to us through coordinated disclosure. + +Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests. + +Instead, please send an email to info[@]jamf.com. 
+ +Please include as much of the information listed below as you can to help us better understand and resolve the issue: +- The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) +- Full paths of source file(s) related to the manifestation of the issue +- The location of the affected source code (tag/branch/commit or direct URL) +- Any special configuration required to reproduce the issue +- Step-by-step instructions to reproduce the issue +- Proof-of-concept or exploit code (if possible) +- Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. From 7e659f69de9b16a863e19621e2c76ea84b87e437 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Mon, 2 Oct 2023 13:40:32 +0100 Subject: [PATCH 03/59] TRIVIAL - init --- asm2vec/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 0962ef8..ae7efea 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1,6 +1 @@ -import importlib - -__all__ = ['model', 'datatype', 'utils'] - -for module in __all__: - importlib.import_module(f'.{module}', 'asm2vec') +__all__ = ["datatype", "model", "utils", "version"] From 20df9cceb0c491150764b1096fd0c44cd134745d Mon Sep 17 00:00:00 2001 From: "CI2.0" Date: Mon, 2 Oct 2023 12:43:46 +0000 Subject: [PATCH 04/59] [Jenkins] Set version to 1.0.1 --- asm2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index f8e7582..f1ae280 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,2 +1,2 @@ -VERSION = '1.0.0' +VERSION = '1.0.1' DEV_VERSION = '0' From 5be2ef8eaaaeff22a2a07e84e4821ffdbd8e71c3 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:29:39 +0200 Subject: [PATCH 05/59] AEGIS-6405 datatype PEP8 --- asm2vec/datatype.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/asm2vec/datatype.py b/asm2vec/datatype.py index a3cd39b..b6451d8 100644 --- a/asm2vec/datatype.py +++ b/asm2vec/datatype.py @@ -2,19 +2,23 @@ import random import warnings + class Token: def __init__(self, name, index): self.name = name self.index = index self.count = 1 + def __str__(self): return self.name + class Tokens: def __init__(self, name_to_index=None, tokens=None): self.name_to_index = name_to_index or {} self.tokens = tokens or [] self._weights = None + def __getitem__(self, key): if type(key) is str: if self.name_to_index.get(key) is None: @@ -28,13 +32,17 @@ def __getitem__(self, key): return [self[k] for k in key] except: raise ValueError + def load_state_dict(self, sd): self.name_to_index = sd['name_to_index'] self.tokens = sd['tokens'] + def state_dict(self): return {'name_to_index': self.name_to_index, 'tokens': self.tokens} + def size(self): return len(self.tokens) + def add(self, names): self._weights = None if type(names) is not list: @@ -46,6 +54,7 @@ def add(self, names): self.tokens.append(token) else: self.tokens[self.name_to_index[name]].count += 1 + def update(self, tokens_new): for token in tokens_new: if token.name not in self.name_to_index: @@ -54,6 +63,7 @@ def update(self, tokens_new): self.tokens.append(token) else: self.tokens[self.name_to_index[token.name]].count += token.count + def weights(self): # if no cache, calculate if self._weights is None: @@ -62,19 +72,22 @@ def weights(self): 
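             # (count / total) ** 0.75 below builds the smoothed unigram distribution
             # that sample() later feeds to torch.multinomial for negative sampling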
for token in self.tokens: self._weights[token.index] = (token.count / total) ** 0.75 return self._weights + def sample(self, batch_size, num=5): return torch.multinomial(self.weights(), num * batch_size, replacement=True).view(batch_size, num) + class Function: def __init__(self, insts, blocks, meta): self.insts = insts self.blocks = blocks self.meta = meta + @classmethod def load(cls, text): - ''' - gcc -S format compatiable - ''' + """gcc -S format compatible + """ + label, labels, insts, blocks, meta = None, {}, [], [], {} for line in text.strip('\n').split('\n'): if line[0] in [' ', '\t']: @@ -109,10 +122,13 @@ def load(cls, text): if labels.get(arg): inst.args[i] = 'CONST' return cls(insts, blocks, meta) + def tokens(self): return [token for inst in self.insts for token in inst.tokens()] + def random_walk(self, num=3): return [self._random_walk() for _ in range(num)] + def _random_walk(self): current, visited, seq = self.blocks[0], [], [] while current not in visited: @@ -124,25 +140,31 @@ def _random_walk(self): current = random.choice(list(current.successors)) return seq + class BasicBlock: def __init__(self): self.insts = [] self.successors = set() + def add(self, inst): self.insts.append(inst) + def end(self): inst = self.insts[-1] return inst.is_jmp() or inst.op == 'ret' + class Instruction: def __init__(self, op, args): self.op = op self.args = args + def __str__(self): return f'{self.op} {", ".join([str(arg) for arg in self.args if str(arg)])}' + @classmethod def load(cls, text): - text = text.strip().strip('bnd').strip() # get rid of BND prefix + text = text.strip().strip('bnd').strip() op, _, args = text.strip().partition(' ') if args: args = [arg.strip() for arg in args.split(',')] @@ -150,9 +172,12 @@ def load(cls, text): args = [] args = (args + ['', ''])[:2] return cls(op, args) + def tokens(self): return [self.op] + self.args + def is_jmp(self): return 'jmp' in self.op or self.op[0] == 'j' + def is_call(self): return self.op == 'call' From 2833c60d6c1323bf0bebba0e99ee39282dc3204a Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:30:43 +0200 Subject: [PATCH 06/59] AEGIS-6405 PEP8 model.py --- asm2vec/model.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/asm2vec/model.py b/asm2vec/model.py index 301f3be..74a6ace 100644 --- a/asm2vec/model.py +++ b/asm2vec/model.py @@ -3,35 +3,43 @@ bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1) + class ASM2VEC(nn.Module): def __init__(self, vocab_size, function_size, embedding_size): super(ASM2VEC, self).__init__() - self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size)) - self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size, _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2) - self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size, _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2) + self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size)) + self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size, + _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2) + self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size, + _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2) def update(self, function_size_new, vocab_size_new): device = 
self.embeddings.weight.device - vocab_size, function_size, embedding_size = self.embeddings.num_embeddings, self.embeddings_f.num_embeddings, self.embeddings.embedding_dim + vocab_size, function_size, embedding_size = (self.embeddings.num_embeddings, + self.embeddings_f.num_embeddings, self.embeddings.embedding_dim) if vocab_size_new != vocab_size: - weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size).to(device)]) + weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size). + to(device)]) self.embeddings = nn.Embedding(vocab_size_new, embedding_size, _weight=weight) - weight_r = torch.cat([self.embeddings_r.weight, ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2).to(device)]) + weight_r = torch.cat([self.embeddings_r.weight, + ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2) + .to(device)]) self.embeddings_r = nn.Embedding(vocab_size_new, 2 * embedding_size, _weight=weight_r) - self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size, _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5)/embedding_size/2).to(device)) + self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size, + _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5) / + embedding_size/2).to(device)) def v(self, inp): - e = self.embeddings(inp[:,1:]) - v_f = self.embeddings_f(inp[:,0]) - v_prev = torch.cat([e[:,0], (e[:,1] + e[:,2]) / 2], dim=1) - v_next = torch.cat([e[:,3], (e[:,4] + e[:,5]) / 2], dim=1) + e = self.embeddings(inp[:, 1:]) + v_f = self.embeddings_f(inp[:, 0]) + v_prev = torch.cat([e[:, 0], (e[:, 1] + e[:, 2]) / 2], dim=1) + v_next = torch.cat([e[:, 3], (e[:, 4] + e[:, 5]) / 2], dim=1) v = ((v_f + v_prev + v_next) / 3).unsqueeze(2) return v def forward(self, inp, pos, neg): device, batch_size = inp.device, inp.shape[0] v = self.v(inp) - # negative sampling loss pred = torch.bmm(self.embeddings_r(torch.cat([pos, neg], dim=1)), v).squeeze() label = torch.cat([torch.ones(batch_size, 3), torch.zeros(batch_size, neg.shape[1])], dim=1).to(device) return bce(sigmoid(pred), label) @@ -39,5 +47,6 @@ def forward(self, inp, pos, neg): def predict(self, inp, pos): device, batch_size = inp.device, inp.shape[0] v = self.v(inp) - probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).to(device)), v).squeeze(dim=2) + probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1). 
+ to(device)), v).squeeze(dim=2) return softmax(probs) From 5d0353408a7c1bf856da24c4a2021cd5a4818bd0 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:31:34 +0200 Subject: [PATCH 07/59] AEGIS-6405 PEP8 utils.py --- asm2vec/utils.py | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/asm2vec/utils.py b/asm2vec/utils.py index 4f9aa25..6c5b539 100644 --- a/asm2vec/utils.py +++ b/asm2vec/utils.py @@ -3,18 +3,22 @@ import torch from torch.utils.data import DataLoader, Dataset from pathlib import Path -from .datatype import Tokens, Function, Instruction -from .model import ASM2VEC +from datatype import Tokens, Function, Instruction +from model import ASM2VEC + class AsmDataset(Dataset): def __init__(self, x, y): self.x = x self.y = y + def __len__(self): return len(self.x) + def __getitem__(self, index): return self.x[index], self.y[index] + def load_data(paths, limit=None): if type(paths) is not list: paths = [paths] @@ -22,7 +26,8 @@ def load_data(paths, limit=None): filenames = [] for path in paths: if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) if os.path.isfile(Path(path) / filename)] + filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) + if os.path.isfile(Path(path) / filename)] else: filenames += [Path(path)] @@ -37,6 +42,7 @@ def load_data(paths, limit=None): return functions, tokens + def preprocess(functions, tokens): x, y = [], [] for i, fn in enumerate(functions): @@ -46,6 +52,7 @@ def preprocess(functions, tokens): y.append([tokens[token].index for token in seq[j].tokens()]) return torch.tensor(x), torch.tensor(y) + def train( functions, tokens, @@ -102,6 +109,7 @@ def train( return model + def save_model(path, model, tokens): torch.save({ 'model_params': ( @@ -113,6 +121,7 @@ def save_model(path, model, tokens): 'tokens': tokens.state_dict(), }, path) + def load_model(path, device='cpu'): checkpoint = torch.load(path, map_location=device) tokens = Tokens() @@ -122,35 +131,37 @@ def load_model(path, device='cpu'): model = model.to(device) return model, tokens + def show_probs(x, y, probs, tokens, limit=None, pretty=False): if pretty: - TL, TR, BL, BR = '┌', '┐', '└', '┘' - LM, RM, TM, BM = '├', '┤', '┬', '┴' - H, V = '─', '│' + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' arrow = ' ➔' else: - TL = TR = BL = BR = '+' - LM = RM = TM = BM = '+' - H, V = '-', '|' + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' arrow = '->' top = probs.topk(5) for i, (xi, yi) in enumerate(zip(x, y)): if limit and i >= limit: break xi, yi = xi.tolist(), yi.tolist() - print(TL + H * 42 + TR) - print(f'{V} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {V}') - print(f'{V} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {V}') - print(f'{V} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {V}') - print(LM + H * 8 + TM + H * 33 + RM) + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) for value, index in zip(top.values[i], top.indices[i]): if index in yi: colorbegin, colorclear = '\033[92m', '\033[0m' else: colorbegin, colorclear = '', '' - 
print(f'{V} {colorbegin}{value*100:05.2f}%{colorclear} {V} {colorbegin}{tokens[index.item()].name:31}{colorclear} {V}') - print(BL + H * 8 + BM + H * 33 + BR) + print(f'{v} {colorbegin}{value*100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) + def accuracy(y, probs): return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) - From 62bafc3b6bce937124c0223465ab66071253f64c Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:33:45 +0200 Subject: [PATCH 08/59] AEGIS-6405 Create binary_to_assembly.py --- asm2vec/disassembling.py | 148 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 asm2vec/disassembling.py diff --git a/asm2vec/disassembling.py b/asm2vec/disassembling.py new file mode 100644 index 0000000..87158c5 --- /dev/null +++ b/asm2vec/disassembling.py @@ -0,0 +1,148 @@ +import re +import os +import hashlib +import r2pipe +import logging +from pathlib import Path + + +class BinaryToAsm: + + def __init__(self, input_path: str, output_path: str) -> None: + """Disassembles the newly collected malware files + :param input_path: the path to the malware binaries + :param output_path: the path for the assembly functions to be extracted + """ + self.binary_dir = Path(input_path) + self.asm_dir = Path(output_path) + + @staticmethod + def _sha3(asm: str) -> str: + """Produces SHA3 for each assembly function + :param asm: input assembly function + """ + return hashlib.sha3_256(asm.encode()).hexdigest() + + @staticmethod + def _valid_exe(filename: str) -> bool: + """Extracts magic bytes and returns the header + :param filename: name of the malware file (SHA1) + :return: Boolean of the header existing in magic bytes + """ + magics = [bytes.fromhex('cffaedfe')] + with open(filename, 'rb') as f: + header = f.read(4) + return header in magics + + @staticmethod + def _normalize(opcode: str) -> str: + """ Normalizes the input string + :param opcode: opcode of the binary + """ + opcode = opcode.replace(' - ', ' + ') + opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) + opcode = re.sub(r'\*[0-9]', '*CONST', opcode) + opcode = re.sub(r' [0-9]', ' CONST', opcode) + return opcode + + def _fn_to_asm(self, pdf: dict | None, asm_minlen: int) -> str: + """Converts functions to assembly code + :param pdf: disassembly + :param asm_minlen: minimum length of assembly functions to be extracted + """ + if pdf is None: + return '' + if len(pdf['ops']) < asm_minlen: + return '' + if 'invalid' in [op['type'] for op in pdf['ops']]: + return '' + + ops = pdf['ops'] + + labels, scope = {}, [op['offset'] for op in ops] + assert (None not in scope) + for i, op in enumerate(ops): + if op.get('jump') in scope: + labels.setdefault(op.get('jump'), i) + + output = '' + for op in ops: + if labels.get(op.get('offset')) is not None: + output += f'LABEL{labels[op["offset"]]}:\n' + if labels.get(op.get('jump')) is not None: + output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' + else: + output += f' {self._normalize(op["opcode"])}\n' + + return output + + def bin_to_asm(self, filename: Path, output_path: Path, asm_minlen: int) -> int: + """Fragments the input binary into assembly functions via r2pipe + :param filename: name of the malware file (SHA1) + :param output_path: path to the folder to store the assembly functions for each malware + :param asm_minlen: the minimum length of assembly functions to be extracted + 
:return: the number of assembly functions + """ + if not self._valid_exe(filename): + logging.info('The input file is invalid.') + return 0 + + r = r2pipe.open(str(filename)) + r.cmd('aaaa') + + count = 0 + + for fn in r.cmdj('aflj'): + r.cmd(f's {fn["offset"]}') + asm = self._fn_to_asm(r.cmdj('pdfj'), asm_minlen) + if asm: + uid = self._sha3(asm) + asm = f''' .name {fn["name"]}\ + .offset {fn["offset"]:016x}\ + .file {filename.name}''' + asm + output_asm = os.path.join(output_path, uid) + with open(output_asm, 'w') as file: + file.write(asm) + count += 1 + return count + + def convert_to_asm(self, minlen_upper: int, minlen_lower: int) -> list: + """ Extracts assembly functions from malware files and saves them + into separate folder per binary + :param minlen_upper: The minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number + of assembly functions to minlen_lower + :return: List of sha1 of disassembled malware files + """ + + if not os.path.exists(self.asm_dir): + os.mkdir(self.asm_dir) + + function_count, binary_count, not_found = 0, 0, 0 + disassembled_bins = [] + + if os.path.isdir(self.binary_dir): + for entry in os.scandir(self.binary_dir): + out_dir = os.path.join(self.asm_dir, entry.name) + if not (os.path.exists(out_dir)): + os.mkdir(out_dir) + function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_upper) + if function_count == 0: + function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_lower) + if function_count == 0: + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + not_found += 1 + logging.info("[Error] No such file or directory: {}".format(self.binary_dir)) + + logging.info("Total scanned binaries: {}".format(binary_count)) + logging.info("Not converted binaries: {}".format(not_found)) + + return disassembled_bins From 5dea44335b0ca4ae9b1359fee005806e385e0500 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:35:25 +0200 Subject: [PATCH 09/59] AEGIS-6405 Delete scripts/bin2asm.py scripts/bin2asm.py to be replaced with asm2vec/binary_to_asm.py --- scripts/bin2asm.py | 117 --------------------------------------------- 1 file changed, 117 deletions(-) delete mode 100644 scripts/bin2asm.py diff --git a/scripts/bin2asm.py b/scripts/bin2asm.py deleted file mode 100644 index 2134e8c..0000000 --- a/scripts/bin2asm.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -import re -import os -import click -import r2pipe -import hashlib -from pathlib import Path - -def sha3(data): - return hashlib.sha3_256(data.encode()).hexdigest() - -def validEXE(filename): - magics = [bytes.fromhex('7f454c46')] - with open(filename, 'rb') as f: - header = f.read(4) - return header in magics - -def normalize(opcode): - opcode = opcode.replace(' - ', ' + ') - opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) - opcode = re.sub(r'\*[0-9]', '*CONST', opcode) - opcode = re.sub(r' [0-9]', ' CONST', opcode) - return opcode - -def fn2asm(pdf, minlen): - # check - if pdf is None: - return - if len(pdf['ops']) < minlen: - return - if 'invalid' in [op['type'] for op in pdf['ops']]: - return - - ops = pdf['ops'] - - # set label - labels, scope = {}, [op['offset'] for op in ops] - assert(None not in scope) - for i, op in 
enumerate(ops): - if op.get('jump') in scope: - labels.setdefault(op.get('jump'), i) - - # dump output - output = '' - for op in ops: - # add label - if labels.get(op.get('offset')) is not None: - output += f'LABEL{labels[op["offset"]]}:\n' - # add instruction - if labels.get(op.get('jump')) is not None: - output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' - else: - output += f' {normalize(op["opcode"])}\n' - - return output - -def bin2asm(filename, opath, minlen): - # check - if not validEXE(filename): - return 0 - - r = r2pipe.open(str(filename)) - r.cmd('aaaa') - - count = 0 - - for fn in r.cmdj('aflj'): - r.cmd(f's {fn["offset"]}') - asm = fn2asm(r.cmdj('pdfj'), minlen) - if asm: - uid = sha3(asm) - asm = f''' .name {fn["name"]} - .offset {fn["offset"]:016x} - .file {filename.name} -''' + asm - with open(opath / uid, 'w') as f: - f.write(asm) - count += 1 - - print(f'[+] {filename}') - - return count - -@click.command() -@click.option('-i', '--input', 'ipath', help='input directory / file', required=True) -@click.option('-o', '--output', 'opath', default='asm', help='output directory') -@click.option('-l', '--len', 'minlen', default=10, help='ignore assembly code with instructions amount smaller than minlen') -def cli(ipath, opath, minlen): - ''' - Extract assembly functions from binary executable - ''' - ipath = Path(ipath) - opath = Path(opath) - - # create output directory - if not os.path.exists(opath): - os.mkdir(opath) - - fcount, bcount = 0, 0 - - # directory - if os.path.isdir(ipath): - for f in os.listdir(ipath): - if not os.path.islink(ipath / f) and not os.path.isdir(ipath / f): - fcount += bin2asm(ipath / f, opath, minlen) - bcount += 1 - # file - elif os.path.exists(ipath): - fcount += bin2asm(ipath, opath, minlen) - bcount += 1 - else: - print(f'[Error] No such file or directory: {ipath}') - - print(f'[+] Total scan binary: {bcount} => Total generated assembly functions: {fcount}') - -if __name__ == '__main__': - cli() From 988f430156afadb153324b6e5ed41ea23e02d7ff Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:36:53 +0200 Subject: [PATCH 10/59] AEGIS-6405 Rename disassembling.py to binary_to_asm.py --- asm2vec/{disassembling.py => binary_to_asm.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename asm2vec/{disassembling.py => binary_to_asm.py} (100%) diff --git a/asm2vec/disassembling.py b/asm2vec/binary_to_asm.py similarity index 100% rename from asm2vec/disassembling.py rename to asm2vec/binary_to_asm.py From ec58db1cabddc36f6fc193a5ae01fb2007ec067b Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:01:09 +0200 Subject: [PATCH 11/59] AEGIS-6405 Update __init__.py --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index ae7efea..d3afa2c 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "version"] +__all__ = ["datatype", "model", "utils", "binary_to_asm", "version"] From 18cd90c59df8faddca0ca537ee51e98eeb812a15 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:14:29 +0200 Subject: [PATCH 12/59] AEGIS-6405 Update asm2vec/utils.py - JN review Co-authored-by: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> --- asm2vec/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/asm2vec/utils.py b/asm2vec/utils.py index 6c5b539..b233d33 100644 --- a/asm2vec/utils.py +++ b/asm2vec/utils.py @@ -3,8 +3,8 @@ import torch from torch.utils.data import DataLoader, Dataset from pathlib import Path -from datatype import Tokens, Function, Instruction -from model import ASM2VEC +from asm2vec.datatype import Tokens, Function, Instruction +from asm2vec.model import ASM2VEC class AsmDataset(Dataset): From 6632b198c84f36ed8cde12b53428ad058834bf9f Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:26:56 +0200 Subject: [PATCH 13/59] AEGIS-6405 remove class --- asm2vec/binary_to_asm.py | 265 +++++++++++++++++++-------------------- 1 file changed, 131 insertions(+), 134 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 87158c5..fe30d9a 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -6,143 +6,140 @@ from pathlib import Path -class BinaryToAsm: - - def __init__(self, input_path: str, output_path: str) -> None: - """Disassembles the newly collected malware files - :param input_path: the path to the malware binaries - :param output_path: the path for the assembly functions to be extracted - """ - self.binary_dir = Path(input_path) - self.asm_dir = Path(output_path) - - @staticmethod - def _sha3(asm: str) -> str: - """Produces SHA3 for each assembly function - :param asm: input assembly function - """ - return hashlib.sha3_256(asm.encode()).hexdigest() - - @staticmethod - def _valid_exe(filename: str) -> bool: - """Extracts magic bytes and returns the header - :param filename: name of the malware file (SHA1) - :return: Boolean of the header existing in magic bytes - """ - magics = [bytes.fromhex('cffaedfe')] - with open(filename, 'rb') as f: - header = f.read(4) - return header in magics - - @staticmethod - def _normalize(opcode: str) -> str: - """ Normalizes the input string - :param opcode: opcode of the binary - """ - opcode = opcode.replace(' - ', ' + ') - opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) - opcode = re.sub(r'\*[0-9]', '*CONST', opcode) - opcode = re.sub(r' [0-9]', ' CONST', opcode) - return opcode - - def _fn_to_asm(self, pdf: dict | None, asm_minlen: int) -> str: - """Converts functions to assembly code - :param pdf: disassembly - :param asm_minlen: minimum length of assembly functions to be extracted - """ - if pdf is None: - return '' - if len(pdf['ops']) < asm_minlen: - return '' - if 'invalid' in [op['type'] for op in pdf['ops']]: - return '' - - ops = pdf['ops'] - - labels, scope = {}, [op['offset'] for op in ops] - assert (None not in scope) - for i, op in enumerate(ops): - if op.get('jump') in scope: - labels.setdefault(op.get('jump'), i) - - output = '' - for op in ops: - if labels.get(op.get('offset')) is not None: - output += f'LABEL{labels[op["offset"]]}:\n' - if labels.get(op.get('jump')) is not None: - output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' - else: - output += f' {self._normalize(op["opcode"])}\n' - - return output - - def bin_to_asm(self, filename: Path, output_path: Path, asm_minlen: int) -> int: - """Fragments the input binary into assembly functions via r2pipe - :param filename: name of the malware file (SHA1) - :param output_path: path to the folder to store the assembly functions for each malware - :param asm_minlen: the minimum length of assembly functions to be extracted - :return: the number of assembly functions - """ - if not self._valid_exe(filename): - logging.info('The input file is invalid.') - 
return 0 - - r = r2pipe.open(str(filename)) - r.cmd('aaaa') - - count = 0 - - for fn in r.cmdj('aflj'): - r.cmd(f's {fn["offset"]}') - asm = self._fn_to_asm(r.cmdj('pdfj'), asm_minlen) - if asm: - uid = self._sha3(asm) - asm = f''' .name {fn["name"]}\ - .offset {fn["offset"]:016x}\ - .file {filename.name}''' + asm - output_asm = os.path.join(output_path, uid) - with open(output_asm, 'w') as file: - file.write(asm) - count += 1 - return count - - def convert_to_asm(self, minlen_upper: int, minlen_lower: int) -> list: - """ Extracts assembly functions from malware files and saves them - into separate folder per binary - :param minlen_upper: The minimum number of assembly functions needed for disassembling - :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number - of assembly functions to minlen_lower - :return: List of sha1 of disassembled malware files - """ - - if not os.path.exists(self.asm_dir): - os.mkdir(self.asm_dir) - - function_count, binary_count, not_found = 0, 0, 0 - disassembled_bins = [] - - if os.path.isdir(self.binary_dir): - for entry in os.scandir(self.binary_dir): - out_dir = os.path.join(self.asm_dir, entry.name) - if not (os.path.exists(out_dir)): - os.mkdir(out_dir) - function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_upper) +def _sha3(asm: str) -> str: + """Produces SHA3 for each assembly function + :param asm: input assembly function + """ + return hashlib.sha3_256(asm.encode()).hexdigest() + + +def _valid_exe(filename: str) -> bool: + """Extracts magic bytes and returns the header + :param filename: name of the malware file (SHA1) + :return: Boolean of the header existing in magic bytes + """ + magics = [bytes.fromhex('cffaedfe')] + with open(filename, 'rb') as f: + header = f.read(4) + return header in magics + + +def _normalize(opcode: str) -> str: + """ Normalizes the input string + :param opcode: opcode of the binary + """ + opcode = opcode.replace(' - ', ' + ') + opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) + opcode = re.sub(r'\*[0-9]', '*CONST', opcode) + opcode = re.sub(r' [0-9]', ' CONST', opcode) + return opcode + + +def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: + """Converts functions to assembly code + :param pdf: disassembly + :param asm_minlen: minimum length of assembly functions to be extracted + """ + if pdf is None: + return '' + if len(pdf['ops']) < asm_minlen: + return '' + if 'invalid' in [op['type'] for op in pdf['ops']]: + return '' + + ops = pdf['ops'] + + labels, scope = {}, [op['offset'] for op in ops] + assert (None not in scope) + for i, op in enumerate(ops): + if op.get('jump') in scope: + labels.setdefault(op.get('jump'), i) + + output = '' + for op in ops: + if labels.get(op.get('offset')) is not None: + output += f'LABEL{labels[op["offset"]]}:\n' + if labels.get(op.get('jump')) is not None: + output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' + else: + output += f' {_normalize(op["opcode"])}\n' + + return output + + +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int) -> int: + """Fragments the input binary into assembly functions via r2pipe + :param filename: name of the malware file (SHA1) + :param output_path: path to the folder to store the assembly functions for each malware + :param asm_minlen: the minimum length of assembly functions to be extracted + :return: the number of assembly functions + """ + if not _valid_exe(filename): + logging.info('The input file is invalid.') + return 0 + + r = r2pipe.open(str(filename)) + r.cmd('aaaa') 
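+    # r2pipe commands used here: 'aaaa' runs radare2's most thorough
+    # auto-analysis pass, 'aflj' lists the recovered functions as JSON,
+    # and for each one 's <offset>' seeks to it while 'pdfj' returns its
+    # disassembly as JSON for _fn_to_asm to filter and normalize.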
+ + count = 0 + + for fn in r.cmdj('aflj'): + r.cmd(f's {fn["offset"]}') + asm = _fn_to_asm(r.cmdj('pdfj'), asm_minlen) + if asm: + uid = _sha3(asm) + asm = f''' .name {fn["name"]}\ + .offset {fn["offset"]:016x}\ + .file {filename.name}''' + asm + output_asm = os.path.join(output_path, uid) + with open(output_asm, 'w') as file: + file.write(asm) + count += 1 + return count + + +def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int) -> list: + """ Extracts assembly functions from malware files and saves them + into separate folder per binary + :param input_path: the path to the malware binaries + :param output_path: the path for the assembly functions to be extracted + :param minlen_upper: The minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number + of assembly functions to minlen_lower + :return: List of sha1 of disassembled malware files + """ + + binary_dir = Path(input_path) + asm_dir = Path(output_path) + + if not os.path.exists(asm_dir): + os.mkdir(asm_dir) + + function_count, binary_count, not_found = 0, 0, 0 + disassembled_bins = [] + + if os.path.isdir(binary_dir): + for entry in os.scandir(binary_dir): + out_dir = os.path.join(asm_dir, entry.name) + if not (os.path.exists(out_dir)): + os.mkdir(out_dir) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper) + if function_count == 0: + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower) if function_count == 0: - function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_lower) - if function_count == 0: - os.rmdir(out_dir) - logging.info('The binary {} was not disassembled'.format(entry.name)) - else: - binary_count += 1 - disassembled_bins.append(entry.name) + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) else: binary_count += 1 disassembled_bins.append(entry.name) - else: - not_found += 1 - logging.info("[Error] No such file or directory: {}".format(self.binary_dir)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + not_found += 1 + logging.info("[Error] No such file or directory: {}".format(binary_dir)) - logging.info("Total scanned binaries: {}".format(binary_count)) - logging.info("Not converted binaries: {}".format(not_found)) + logging.info("Total scanned binaries: {}".format(binary_count)) + logging.info("Not converted binaries: {}".format(not_found)) - return disassembled_bins + return disassembled_bins From 98f9868be39f6e03bb300d2a44edbd67231e17d9 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 23:57:00 +0200 Subject: [PATCH 14/59] AEGIS-6406 Create train.py Script for training an asm2vec model --- asm2vec/train.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ scripts/train.py | 52 -------------------------------- 2 files changed, 77 insertions(+), 52 deletions(-) create mode 100644 asm2vec/train.py delete mode 100644 scripts/train.py diff --git a/asm2vec/train.py b/asm2vec/train.py new file mode 100644 index 0000000..32c11c0 --- /dev/null +++ b/asm2vec/train.py @@ -0,0 +1,77 @@ +import torch +import asm2vec +import logging +from pathlib import Path +from asm2vec import utils + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def callback(context) -> None: + """Prettifies the display of accuracy, if chosen + """ + progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ + 
loss = {context["loss"]:.4f}' + + if context["accuracy"]: + progress += f', accuracy = {context["accuracy"]:.4f}' + logging.info(f"{progress}") + + +def train_asm2vec_model( + train_set: str, + new_model: str, + model_path: str | None, + limit: int, + epochs: int, + calc_acc: False, + embedding_size=100, + batch_size=1024, + neg_sample=25, + lr=0.02, + device='cpu', +) -> None: + """Trains an asm2vec model + :param train_set: path to the training dataset + :param new_model: path to the model to be trained + :param model_path: path to already trained model + :param limit: number of the assembly functions that the model will be trained on; + if not defined, all the assembly functions in train_set_path + :param epochs: number of epochs + :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training + :param embedding_size: size of the vector representation for a token; an assembly function + will be represented with a vector twice that size + :param batch_size: the size of batches for training + :param neg_sample: negative sampling amount + :param device: 'auto' | 'cuda' | 'cpu' + :param lr: learning rate + """ + + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + if model_path: + model, tokens = asm2vec.utils.load_model(model_path, device=device) + functions, tokens_new = asm2vec.utils.load_data(train_set, limit=limit) + tokens.update(tokens_new) + model.update(len(functions), tokens.size()) + else: + model = None + functions, tokens = asm2vec.utils.load_data(Path(train_set), limit=limit) + + model = asm2vec.utils.train( + functions, + tokens, + model=model, + embedding_size=embedding_size, + batch_size=batch_size, + epochs=epochs, + neg_sample_num=neg_sample, + calc_acc=calc_acc, + device=device, + callback=callback, + learning_rate=lr + ) + asm2vec.utils.save_model(new_model, model, tokens) + + return None diff --git a/scripts/train.py b/scripts/train.py deleted file mode 100644 index 98391f4..0000000 --- a/scripts/train.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -import click -import asm2vec - -@click.command() -@click.option('-i', '--input', 'ipath', help='training data folder', required=True) -@click.option('-o', '--output', 'opath', default='model.pt', help='output model path', show_default=True) -@click.option('-m', '--model', 'mpath', help='load previous trained model path', type=str) -@click.option('-l', '--limit', help='limit the number of functions to be loaded', show_default=True, type=int) -@click.option('-d', '--ebedding-dimension', 'embedding_size', default=100, help='embedding dimension', show_default=True) -@click.option('-b', '--batch-size', 'batch_size', default=1024, help='batch size', show_default=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True) -@click.option('-a', '--calculate-accuracy', 'calc_acc', help='whether calculate accuracy ( will be significantly slower )', is_flag=True) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -def cli(ipath, opath, mpath, limit, embedding_size, batch_size, epochs, neg_sample_num, calc_acc, device, lr): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - if mpath: - model, tokens = 
asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data(ipath, limit=limit) - tokens.update(tokens_new) - model.update(len(functions), tokens.size()) - else: - model = None - functions, tokens = asm2vec.utils.load_data(ipath, limit=limit) - - def callback(context): - progress = f'{context["epoch"]} | time = {context["time"]:.2f}, loss = {context["loss"]:.4f}' - if context["accuracy"]: - progress += f', accuracy = {context["accuracy"]:.4f}' - print(progress) - asm2vec.utils.save_model(opath, context["model"], context["tokens"]) - - model = asm2vec.utils.train( - functions, - tokens, - model=model, - embedding_size=embedding_size, - batch_size=batch_size, - epochs=epochs, - neg_sample_num=neg_sample_num, - calc_acc=calc_acc, - device=device, - callback=callback, - learning_rate=lr - ) - -if __name__ == '__main__': - cli() From 74be99e44cebdd6f03cec67a56d8eef7e002b4a3 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 23:57:45 +0200 Subject: [PATCH 15/59] AEGIS-6406 Update __init__.py --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index d3afa2c..f6e961b 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "binary_to_asm", "version"] +__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "version"] From 2755c99e3e3d69d1d73bd9f9ec847a6a4686f197 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 00:20:56 +0200 Subject: [PATCH 16/59] AEGIS-6406 Update __init__.py --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index f6e961b..291f06f 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "version"] +__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "tensors", "version"] From 26d492c2e81d4b19e34a642fa096dc0c5053df75 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 00:21:54 +0200 Subject: [PATCH 17/59] AEGIS-6406 Create tensors.py Script for calculation of tensor representations --- asm2vec/tensors.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 asm2vec/tensors.py diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py new file mode 100644 index 0000000..0d419c0 --- /dev/null +++ b/asm2vec/tensors.py @@ -0,0 +1,69 @@ +import os +import torch +import logging +import asm2vec +from asm2vec import utils +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.02) -> list: + """Calculates vector representation of a binary as the mean per column + of the vector representations of its assembly functions + :param asm_path: folder with assembly function in a subfolder per binary + :param tensor_path: folder to store the tensors + :param model_path: path to the trained model + :param epochs: number of epochs + :param device: 'auto' | 'cuda' | 'cpu' + :param lr: learning rate + """ + tensors_list = [] + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + if os.path.isfile(model_path): + model, tokens = 
asm2vec.utils.load_model(model_path, device=device) + else: + print("No valid model") + return [] + + dir0 = Path(tensor_path) + if not (os.path.exists(dir0)): + os.mkdir(dir0) + + if os.path.isdir(asm_path): + obj = os.scandir(asm_path) + for entry in obj: + if entry.is_dir() and os.listdir(entry) and entry.name: + tensor_file = os.path.join(dir0, entry.name) + if not (os.path.exists(tensor_file)): + functions, tokens_new = asm2vec.utils.load_data([entry]) + file_count = sum(len(files) for _, _, files in os.walk(entry)) + tokens.update(tokens_new) + logging.info(f"Binary {entry.name}: {file_count} assembly functions") + model.update(file_count, tokens.size()) + model = model.to(device) + + model = asm2vec.utils.train( + functions, + tokens, + model=model, + epochs=epochs, + device=device, + mode='test', + learning_rate=lr + ) + + tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))])) + tens = torch.squeeze(tensor) + if file_count == 1: + torch.save(tensor, tensor_file) + else: + torch.save(tens.mean(0), tensor_file) + tensors_list.append(entry.name) + + else: + logging.info("No valid directory") + + return tensors_list From 36d29bb74f6931f80f3d1a9da547bb746446bc9a Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 09:45:56 +0200 Subject: [PATCH 18/59] AEGIS-6405 pass magic bytes as variable Magic bytes as variable so that it is usable for other OS/file formats --- asm2vec/binary_to_asm.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index fe30d9a..1480459 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -13,12 +13,13 @@ def _sha3(asm: str) -> str: return hashlib.sha3_256(asm.encode()).hexdigest() -def _valid_exe(filename: str) -> bool: +def _valid_exe(filename: str, magic_bytes) -> bool: """Extracts magic bytes and returns the header :param filename: name of the malware file (SHA1) + :param magic_bytes for the specific OS/type of binary :return: Boolean of the header existing in magic bytes """ - magics = [bytes.fromhex('cffaedfe')] + magics = [bytes.fromhex(magic_bytes)] with open(filename, 'rb') as f: header = f.read(4) return header in magics @@ -67,14 +68,15 @@ def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: return output -def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int) -> int: +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes) -> int: """Fragments the input binary into assembly functions via r2pipe :param filename: name of the malware file (SHA1) :param output_path: path to the folder to store the assembly functions for each malware :param asm_minlen: the minimum length of assembly functions to be extracted + :param magic_bytes for the specific OS/type of binary :return: the number of assembly functions """ - if not _valid_exe(filename): + if not _valid_exe(filename, magic_bytes): logging.info('The input file is invalid.') return 0 @@ -98,7 +100,7 @@ def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int) -> int: return count -def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int) -> list: +def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int, magic_bytes='cffaedfe') -> list: """ Extracts assembly functions from malware files and saves them into separate folder per binary :param input_path: the path to the malware binaries @@ -106,6 +108,7 @@ def 
convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int :param minlen_upper: The minimum number of assembly functions needed for disassembling :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number of assembly functions to minlen_lower + :param magic_bytes for the specific OS/type of binary :return: List of sha1 of disassembled malware files """ @@ -123,9 +126,9 @@ def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int out_dir = os.path.join(asm_dir, entry.name) if not (os.path.exists(out_dir)): os.mkdir(out_dir) - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) if function_count == 0: - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) if function_count == 0: os.rmdir(out_dir) logging.info('The binary {} was not disassembled'.format(entry.name)) From f47abd0befe33b62871b706b7e186cf19dbc8307 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:30:56 +0200 Subject: [PATCH 19/59] AEGIS-6405 fixing logging --- asm2vec/binary_to_asm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 1480459..a77b5a7 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -5,6 +5,8 @@ import logging from pathlib import Path +logging.basicConfig(level=logging.INFO, format='%(message)s') + def _sha3(asm: str) -> str: """Produces SHA3 for each assembly function From 9c3cf83023dcab680bc7cbf7006768be3157350f Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:34:47 +0200 Subject: [PATCH 20/59] AEGIS-6406 Update - JN review Co-authored-by: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> --- asm2vec/tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 0d419c0..33027a0 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -23,7 +23,7 @@ def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.0 device = 'cuda' if torch.cuda.is_available() else 'cpu' if os.path.isfile(model_path): - model, tokens = asm2vec.utils.load_model(model_path, device=device) + model, tokens = utils.load_model(model_path, device=device) else: print("No valid model") return [] From 9d0ea0fb32a8d86da347f0218f9a20cbcac36aee Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:47:20 +0200 Subject: [PATCH 21/59] AEGIS-6406 fix package import, args types --- asm2vec/tensors.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 33027a0..4319cd2 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -1,14 +1,18 @@ import os import torch import logging -import asm2vec from asm2vec import utils from pathlib import Path logging.basicConfig(level=logging.INFO, format='%(message)s') -def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.02) -> list: +def calc_tensors(asm_path: str, + tensor_path: str, + model_path: str, + epochs: int, + device: str = 'cpu', + learning_rate: float = 0.02) -> list: """Calculates vector representation of a binary as the mean per column 
of the vector representations of its assembly functions :param asm_path: folder with assembly function in a subfolder per binary @@ -16,7 +20,7 @@ def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.0 :param model_path: path to the trained model :param epochs: number of epochs :param device: 'auto' | 'cuda' | 'cpu' - :param lr: learning rate + :param learning_rate: learning rate """ tensors_list = [] if device == 'auto': @@ -38,14 +42,14 @@ def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.0 if entry.is_dir() and os.listdir(entry) and entry.name: tensor_file = os.path.join(dir0, entry.name) if not (os.path.exists(tensor_file)): - functions, tokens_new = asm2vec.utils.load_data([entry]) + functions, tokens_new = utils.load_data([entry]) file_count = sum(len(files) for _, _, files in os.walk(entry)) tokens.update(tokens_new) logging.info(f"Binary {entry.name}: {file_count} assembly functions") model.update(file_count, tokens.size()) model = model.to(device) - model = asm2vec.utils.train( + model = utils.train( functions, tokens, model=model, From 045ea32f152c07dcdbecbe83c3b66551a4fc07bc Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:57:39 +0200 Subject: [PATCH 22/59] AEGIS-6406 args types, function return --- asm2vec/train.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index 32c11c0..6e6eb3c 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -3,6 +3,7 @@ import logging from pathlib import Path from asm2vec import utils +from asm2vec.model import ASM2VEC logging.basicConfig(level=logging.INFO, format='%(message)s') @@ -24,13 +25,13 @@ def train_asm2vec_model( model_path: str | None, limit: int, epochs: int, - calc_acc: False, - embedding_size=100, - batch_size=1024, - neg_sample=25, - lr=0.02, - device='cpu', -) -> None: + calc_acc: bool = False, + embedding_size: int = 100, + batch_size: int = 1024, + neg_sample: int = 25, + learning_rate: float = 0.02, + device: str = 'cpu' +) -> ASM2VEC: """Trains an asm2vec model :param train_set: path to the training dataset :param new_model: path to the model to be trained @@ -44,22 +45,22 @@ def train_asm2vec_model( :param batch_size: the size of batches for training :param neg_sample: negative sampling amount :param device: 'auto' | 'cuda' | 'cpu' - :param lr: learning rate + :param learning_rate: learning rate """ if device == 'auto': device = 'cuda' if torch.cuda.is_available() else 'cpu' if model_path: - model, tokens = asm2vec.utils.load_model(model_path, device=device) - functions, tokens_new = asm2vec.utils.load_data(train_set, limit=limit) + model, tokens = utils.load_model(model_path, device=device) + functions, tokens_new = utils.load_data(train_set, limit=limit) tokens.update(tokens_new) model.update(len(functions), tokens.size()) else: model = None - functions, tokens = asm2vec.utils.load_data(Path(train_set), limit=limit) + functions, tokens = utils.load_data(Path(train_set), limit=limit) - model = asm2vec.utils.train( + model = utils.train( functions, tokens, model=model, @@ -70,8 +71,8 @@ def train_asm2vec_model( calc_acc=calc_acc, device=device, callback=callback, - learning_rate=lr + learning_rate=learning_rate ) - asm2vec.utils.save_model(new_model, model, tokens) + utils.save_model(new_model, model, tokens) - return None + return model From 70acd348bdf3ef932ad3580554ba773f1e57fbcf Mon Sep 17 00:00:00 2001 From: 
ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:59:14 +0200 Subject: [PATCH 23/59] AEGIS-6406 remove import --- asm2vec/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index 6e6eb3c..f161891 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -1,5 +1,4 @@ import torch -import asm2vec import logging from pathlib import Path from asm2vec import utils From 8453b40afc84fadec853af6ce4045cf8e074a178 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:33:28 +0200 Subject: [PATCH 24/59] AEGIS-6405 magic bytes as list of strings If none, then use the magic bytes for MacOS --- asm2vec/binary_to_asm.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index a77b5a7..bfcc33b 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -15,13 +15,13 @@ def _sha3(asm: str) -> str: return hashlib.sha3_256(asm.encode()).hexdigest() -def _valid_exe(filename: str, magic_bytes) -> bool: +def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: """Extracts magic bytes and returns the header :param filename: name of the malware file (SHA1) :param magic_bytes for the specific OS/type of binary :return: Boolean of the header existing in magic bytes """ - magics = [bytes.fromhex(magic_bytes)] + magics = [bytes.fromhex(i) for i in magic_bytes] with open(filename, 'rb') as f: header = f.read(4) return header in magics @@ -70,7 +70,7 @@ def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: return output -def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes) -> int: +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int: """Fragments the input binary into assembly functions via r2pipe :param filename: name of the malware file (SHA1) :param output_path: path to the folder to store the assembly functions for each malware @@ -102,7 +102,12 @@ def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes) return count -def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int, magic_bytes='cffaedfe') -> list: +def convert_to_asm(input_path: str, + output_path: str, + minlen_upper: int, + minlen_lower: int, + magic_bytes: list[str] = None + ) -> list: """ Extracts assembly functions from malware files and saves them into separate folder per binary :param input_path: the path to the malware binaries @@ -110,9 +115,14 @@ def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int :param minlen_upper: The minimum number of assembly functions needed for disassembling :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number of assembly functions to minlen_lower - :param magic_bytes for the specific OS/type of binary + :param magic_bytes: list of valid for the specific OS/type of binary; e.g. 
+ 'cffaedfe' for Mach-O Little Endian (64-bit) + 'feedfacf' for Mach-O Big Endian (64-bit) + 'cafebabe' Universal Binary Big Endian :return: List of sha1 of disassembled malware files """ + if not magic_bytes: + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe'] binary_dir = Path(input_path) asm_dir = Path(output_path) From 36eda60399d83cf6b9055e351433746380eb1e9b Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:39:48 +0200 Subject: [PATCH 25/59] AEGIS-6405 add more magic bytes for MacOS --- asm2vec/binary_to_asm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index bfcc33b..3c141d9 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -118,11 +118,13 @@ def convert_to_asm(input_path: str, :param magic_bytes: list of valid for the specific OS/type of binary; e.g. 'cffaedfe' for Mach-O Little Endian (64-bit) 'feedfacf' for Mach-O Big Endian (64-bit) + 'cefaedfe' for Mach-O Little Endian (32-bit) + 'feedface': Mach-O Big Endian (32-bit) 'cafebabe' Universal Binary Big Endian :return: List of sha1 of disassembled malware files """ if not magic_bytes: - magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe'] + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface'] binary_dir = Path(input_path) asm_dir = Path(output_path) From 96a8a00ad3cffd823b4c85c8ffae1dd48ef68452 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:26:35 +0200 Subject: [PATCH 26/59] AEGIS-6406 migrate utils.py to train.py --- asm2vec/train.py | 178 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 171 insertions(+), 7 deletions(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index f161891..12b8fe5 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -1,12 +1,175 @@ +import os +import time import torch import logging from pathlib import Path -from asm2vec import utils +from torch.utils.data import DataLoader, Dataset from asm2vec.model import ASM2VEC +from asm2vec.datatype import Tokens, Function, Instruction logging.basicConfig(level=logging.INFO, format='%(message)s') +class AsmDataset(Dataset): + def __init__(self, x, y): + self.x = x + self.y = y + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + return self.x[index], self.y[index] + + +def load_data(paths, limit=None): + if type(paths) is not list: + paths = [paths] + + filenames = [] + for path in paths: + if os.path.isdir(path): + filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) + if os.path.isfile(Path(path) / filename)] + else: + filenames += [Path(path)] + + functions, tokens = [], Tokens() + for i, filename in enumerate(filenames): + if limit and i >= limit: + break + with open(filename) as f: + fn = Function.load(f.read()) + functions.append(fn) + tokens.add(fn.tokens()) + + return functions, tokens + + +def preprocess(functions, tokens): + x, y = [], [] + for i, fn in enumerate(functions): + for seq in fn.random_walk(): + for j in range(1, len(seq) - 1): + x.append([i] + [tokens[token].index for token in seq[j - 1].tokens() + seq[j + 1].tokens()]) + y.append([tokens[token].index for token in seq[j].tokens()]) + return torch.tensor(x), torch.tensor(y) + + +def train( + functions, + tokens, + model=None, + embedding_size=100, + batch_size=1024, + epochs=10, + neg_sample_num=25, + calc_acc=False, + device='cpu', + mode='train', + callback=None, 
+ learning_rate=0.02 +): + if mode == 'train': + if model is None: + model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + elif mode == 'test': + if model is None: + raise ValueError("test mode required pretrained model") + optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) + else: + raise ValueError("Unknown mode") + + loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True) + for epoch in range(epochs): + start = time.time() + loss_sum, loss_count, accs = 0.0, 0, [] + + model.train() + for i, (inp, pos) in enumerate(loader): + neg = tokens.sample(inp.shape[0], neg_sample_num) + loss = model(inp.to(device), pos.to(device), neg.to(device)) + loss_sum, loss_count = loss_sum + loss, loss_count + 1 + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if i == 0 and calc_acc: + probs = model.predict(inp.to(device), pos.to(device)) + accs.append(accuracy(pos, probs)) + + if callback: + callback({ + 'model': model, + 'tokens': tokens, + 'epoch': epoch, + 'time': time.time() - start, + 'loss': loss_sum / loss_count, + 'accuracy': torch.tensor(accs).mean() if calc_acc else None + }) + + return model + + +def save_model(path, model, tokens): + torch.save({ + 'model_params': ( + model.embeddings.num_embeddings, + model.embeddings_f.num_embeddings, + model.embeddings.embedding_dim + ), + 'model': model.state_dict(), + 'tokens': tokens.state_dict(), + }, path) + + +def load_model(path, device='cpu'): + checkpoint = torch.load(path, map_location=device) + tokens = Tokens() + tokens.load_state_dict(checkpoint['tokens']) + model = ASM2VEC(*checkpoint['model_params']) + model.load_state_dict(checkpoint['model']) + model = model.to(device) + return model, tokens + + +def show_probs(x, y, probs, tokens, limit=None, pretty=False): + if pretty: + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' + arrow = ' ➔' + else: + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' + arrow = '->' + top = probs.topk(5) + for i, (xi, yi) in enumerate(zip(x, y)): + if limit and i >= limit: + break + xi, yi = xi.tolist(), yi.tolist() + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) + for value, index in zip(top.values[i], top.indices[i]): + if index in yi: + colorbegin, colorclear = '\033[92m', '\033[0m' + else: + colorbegin, colorclear = '', '' + print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) + + +def accuracy(y, probs): + return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) + + def callback(context) -> None: """Prettifies the display of accuracy, if chosen """ @@ -22,8 +185,8 @@ def train_asm2vec_model( train_set: str, new_model: str, model_path: str | None, - limit: int, epochs: int, + limit: int = None, calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, @@ -31,6 +194,7 @@ def train_asm2vec_model( learning_rate: float = 0.02, device: str = 'cpu' ) -> ASM2VEC: + """Trains an asm2vec model :param train_set: path to the 
training dataset :param new_model: path to the model to be trained @@ -51,15 +215,15 @@ def train_asm2vec_model( device = 'cuda' if torch.cuda.is_available() else 'cpu' if model_path: - model, tokens = utils.load_model(model_path, device=device) - functions, tokens_new = utils.load_data(train_set, limit=limit) + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data(train_set, limit=limit) tokens.update(tokens_new) model.update(len(functions), tokens.size()) else: model = None - functions, tokens = utils.load_data(Path(train_set), limit=limit) + functions, tokens = load_data(Path(train_set), limit=limit) - model = utils.train( + model = train( functions, tokens, model=model, @@ -72,6 +236,6 @@ def train_asm2vec_model( callback=callback, learning_rate=learning_rate ) - utils.save_model(new_model, model, tokens) + save_model(new_model, model, tokens) return model From a3bc3d08735cf4c7957b6ab856a3f40f92b3347a Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:27:15 +0200 Subject: [PATCH 27/59] AEGIS-6406 remove utils --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 291f06f..2f3c046 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "tensors", "version"] +__all__ = ["datatype", "model", "binary_to_asm", "train", "tensors", "version"] From b73b9393cf75101090934f1b553156cd3056cf7e Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:37:40 +0200 Subject: [PATCH 28/59] AEGIS-6406 fix imports to account for moving utils.py to train.py --- asm2vec/tensors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 4319cd2..01b306f 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -1,7 +1,7 @@ import os import torch import logging -from asm2vec import utils +from asm2vec.train import train, load_model, load_data from pathlib import Path logging.basicConfig(level=logging.INFO, format='%(message)s') @@ -27,7 +27,7 @@ def calc_tensors(asm_path: str, device = 'cuda' if torch.cuda.is_available() else 'cpu' if os.path.isfile(model_path): - model, tokens = utils.load_model(model_path, device=device) + model, tokens = load_model(model_path, device=device) else: print("No valid model") return [] @@ -42,21 +42,21 @@ def calc_tensors(asm_path: str, if entry.is_dir() and os.listdir(entry) and entry.name: tensor_file = os.path.join(dir0, entry.name) if not (os.path.exists(tensor_file)): - functions, tokens_new = utils.load_data([entry]) + functions, tokens_new = load_data([entry]) file_count = sum(len(files) for _, _, files in os.walk(entry)) tokens.update(tokens_new) logging.info(f"Binary {entry.name}: {file_count} assembly functions") model.update(file_count, tokens.size()) model = model.to(device) - model = utils.train( + model = train( functions, tokens, model=model, epochs=epochs, device=device, mode='test', - learning_rate=lr + learning_rate=learning_rate ) tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))])) From bd8bcd780e9a3d8c6cdcb89ab65b7695efa04d39 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 15:23:03 +0200 Subject: [PATCH 29/59] AEGIS-6406 Delete asm2vec/utils.py The code has been 
migrated to asm2vec/train.py --- asm2vec/utils.py | 167 ----------------------------------------------- 1 file changed, 167 deletions(-) delete mode 100644 asm2vec/utils.py diff --git a/asm2vec/utils.py b/asm2vec/utils.py deleted file mode 100644 index b233d33..0000000 --- a/asm2vec/utils.py +++ /dev/null @@ -1,167 +0,0 @@ -import os -import time -import torch -from torch.utils.data import DataLoader, Dataset -from pathlib import Path -from asm2vec.datatype import Tokens, Function, Instruction -from asm2vec.model import ASM2VEC - - -class AsmDataset(Dataset): - def __init__(self, x, y): - self.x = x - self.y = y - - def __len__(self): - return len(self.x) - - def __getitem__(self, index): - return self.x[index], self.y[index] - - -def load_data(paths, limit=None): - if type(paths) is not list: - paths = [paths] - - filenames = [] - for path in paths: - if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) - if os.path.isfile(Path(path) / filename)] - else: - filenames += [Path(path)] - - functions, tokens = [], Tokens() - for i, filename in enumerate(filenames): - if limit and i >= limit: - break - with open(filename) as f: - fn = Function.load(f.read()) - functions.append(fn) - tokens.add(fn.tokens()) - - return functions, tokens - - -def preprocess(functions, tokens): - x, y = [], [] - for i, fn in enumerate(functions): - for seq in fn.random_walk(): - for j in range(1, len(seq) - 1): - x.append([i] + [tokens[token].index for token in seq[j-1].tokens() + seq[j+1].tokens()]) - y.append([tokens[token].index for token in seq[j].tokens()]) - return torch.tensor(x), torch.tensor(y) - - -def train( - functions, - tokens, - model=None, - embedding_size=100, - batch_size=1024, - epochs=10, - neg_sample_num=25, - calc_acc=False, - device='cpu', - mode='train', - callback=None, - learning_rate=0.02 -): - if mode == 'train': - if model is None: - model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) - optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - elif mode == 'test': - if model is None: - raise ValueError("test mode required pretrained model") - optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) - else: - raise ValueError("Unknown mode") - - loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True) - for epoch in range(epochs): - start = time.time() - loss_sum, loss_count, accs = 0.0, 0, [] - - model.train() - for i, (inp, pos) in enumerate(loader): - neg = tokens.sample(inp.shape[0], neg_sample_num) - loss = model(inp.to(device), pos.to(device), neg.to(device)) - loss_sum, loss_count = loss_sum + loss, loss_count + 1 - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if i == 0 and calc_acc: - probs = model.predict(inp.to(device), pos.to(device)) - accs.append(accuracy(pos, probs)) - - if callback: - callback({ - 'model': model, - 'tokens': tokens, - 'epoch': epoch, - 'time': time.time() - start, - 'loss': loss_sum / loss_count, - 'accuracy': torch.tensor(accs).mean() if calc_acc else None - }) - - return model - - -def save_model(path, model, tokens): - torch.save({ - 'model_params': ( - model.embeddings.num_embeddings, - model.embeddings_f.num_embeddings, - model.embeddings.embedding_dim - ), - 'model': model.state_dict(), - 'tokens': tokens.state_dict(), - }, path) - - -def load_model(path, device='cpu'): - checkpoint = torch.load(path, map_location=device) - tokens = Tokens() - 
tokens.load_state_dict(checkpoint['tokens']) - model = ASM2VEC(*checkpoint['model_params']) - model.load_state_dict(checkpoint['model']) - model = model.to(device) - return model, tokens - - -def show_probs(x, y, probs, tokens, limit=None, pretty=False): - if pretty: - tl, tr, bl, br = '┌', '┐', '└', '┘' - lm, rm, tm, bm = '├', '┤', '┬', '┴' - h, v = '─', '│' - arrow = ' ➔' - else: - tl, tr, bl, br = '+', '+', '+', '+' - lm, rm, tm, bm = '+', '+', '+', '+' - h, v = '-', '|' - arrow = '->' - top = probs.topk(5) - for i, (xi, yi) in enumerate(zip(x, y)): - if limit and i >= limit: - break - xi, yi = xi.tolist(), yi.tolist() - print(tl + h * 42 + tr) - print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') - print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') - print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') - print(lm + h * 8 + tm + h * 33 + rm) - for value, index in zip(top.values[i], top.indices[i]): - if index in yi: - colorbegin, colorclear = '\033[92m', '\033[0m' - else: - colorbegin, colorclear = '', '' - print(f'{v} {colorbegin}{value*100:05.2f}%{colorclear} {v} {colorbegin}' - f'{tokens[index.item()].name:31}{colorclear} {v}') - print(bl + h * 8 + bm + h * 33 + br) - - -def accuracy(y, probs): - return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) From 91bfc9061ee3d72eec824d0cf3f8d55dacb7d31b Mon Sep 17 00:00:00 2001 From: "CI2.0" Date: Tue, 3 Oct 2023 13:49:46 +0000 Subject: [PATCH 30/59] [Jenkins] Set version to 1.0.2 --- asm2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index f1ae280..d6f3f4b 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,2 +1,2 @@ -VERSION = '1.0.1' +VERSION = '1.0.2' DEV_VERSION = '0' From 8751b197e516153bff72fa6535f738f6457ae121 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:52:53 +0200 Subject: [PATCH 31/59] AEGIS-6405 Create test_binary_to_asm.py --- test/test_binary_to_asm.py | 141 +++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 test/test_binary_to_asm.py diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py new file mode 100644 index 0000000..f042166 --- /dev/null +++ b/test/test_binary_to_asm.py @@ -0,0 +1,141 @@ +from pathlib import Path +from unittest import TestCase +from asm2vec.binary_to_asm import (bin_to_asm, + convert_to_asm, + _fn_to_asm, + _normalize, + _sha3, + _valid_exe) + + +class TestBinaryToAsm(TestCase): + + @classmethod + def setUpClass(cls): + print("\n--- TestBinaryToAsm ---") + cls.output_path = 'malware_asm/' + cls.pdf_dict = {'name': 'main', 'size': 18, 'addr': 4294974144, + 'ops': [{'offset': 4294974144, 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, + 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rbp', + 'disasm': 'push rbp', 'bytes': '55', 'family': 'cpu', 'type': 'rpush', + 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0, + 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], + 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA=='}, + {'offset': 4294974145, 'esil': 'rsp,rbp,=', 'refptr': 0, 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbp, rsp', 'disasm': 'mov rbp, rsp', + 'bytes': '4889e5', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, + 'type2_num': 0}, 
{'offset': 4294974148, 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, 'fcn_addr': 4294974144, 'fcn_last': 4294974161, + 'size': 1, 'opcode': 'push rbx', 'disasm': 'push rbx', 'bytes': '53', + 'family': 'cpu', 'type': 'rpush', 'reloc': 'False', + 'type_num': 268435468, 'type2_num': 0}, + {'offset': 4294974149, 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, + 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rax', + 'disasm': 'push rax', 'bytes': '50', 'family': 'cpu', 'type': 'rpush', + 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0}, + {'offset': 4294974150, 'esil': 'rsi,rbx,=', 'refptr': 0, 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbx, rsi', 'disasm': 'mov rbx, rsi', + 'bytes': '4889f3', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, + 'type2_num': 0}, {'offset': 4294974153, 'ptr': 4294985864, + 'esil': '0x2db8,rip,+,[8],rax,=', 'refptr': 8, + 'fcn_addr': 4294974144, 'fcn_last': 4294974155, 'size': 7, + 'opcode': 'mov rax, qword [rip + 0x2db8]', + 'disasm': 'mov rax, qword [0x100004888]', 'bytes': '488b05b82d0000', + 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, + 'type2_num': 0, 'refs': [{'addr': 4294985864, 'type': 'DATA', + 'perm': 'r--'}]}, {'offset': 4294974160, + 'esil': 'rax,rip,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974160, + 'size': 2, + 'opcode': 'jmp rax', + 'disasm': 'jmp rax', + 'bytes': 'ffe0', + 'family': 'cpu', + 'type': 'rjmp', + 'reloc': 'False', + 'type_num': 268435458, + 'type2_num': 0}]} + + def test_sha3(self): + """Should return 64-character long string""" + asm = ("push rbp\n" + "mov rbp, rsp\n" + "push rbx\n" + "push rax\n" + "mov rbx, rsi\n" + "mov rax, qword [rip + CONST]\n" + "jmp rax") + self.assertRegex(_sha3(asm), '^[a-f0-9]{64}') + + def test_valid_exe_when_valid_magic_bytes(self): + """Should return boolean""" + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + filename = Path(binary_location) + magic_bytes = ['cffaedfe'] + self.assertEqual(_valid_exe(filename, magic_bytes), True) + + def test_valid_exe_when_not_valid_magic_bytes(self): + """Should return boolean""" + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + filename = Path(binary_location) + magic_bytes = ['cafebabe'] + self.assertEqual(_valid_exe(filename, magic_bytes), False) + + def test_normalize_when_offset(self): + """Should return normalized opcode""" + opcode = "mov rax, qword [rip + 0x2db8]" + expected_norm_opcode = "mov rax, qword [rip + CONST]" + self.assertEqual(_normalize(opcode), expected_norm_opcode) + + def test_normalize_when_no_offset(self): + """Should return normalized opcode""" + opcode = 'mov rbx, rsi' + expected_norm_opcode = "mov rbx, rsi" + self.assertEqual(_normalize(opcode), expected_norm_opcode) + + def test_fn_to_asm_returns_empty_string_when_pdf_none(self): + """Should return assembly functions with normalized opcode""" + pdf = None + asm_min = 5 + expected_asm = "" + self.assertEqual(_fn_to_asm(pdf, asm_min), expected_asm) + + def test_fn_to_asm_returns_empty_string_when_pdfops_shorter_than_minlen(self): + """Should return assembly functions with normalized opcode""" + asm_minlen = 10 + expected_asm = "" + self.assertEqual(_fn_to_asm(self.pdf_dict, asm_minlen), expected_asm) + + def test_fn_to_asm_returns_expected_asm(self): + """Should return assembly functions with normalized opcode""" + asm_min = 5 + expected_asm = (" push rbp\n" + " mov rbp, rsp\n" + " push rbx\n" + " push 
rax\n" + " mov rbx, rsi\n" + " mov rax, qword [rip + CONST]\n" + " jmp rax\n") + self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) + + def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): + binary_location = "malware/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + asm_minlen = 5 + magic_bytes = ['cffaedfe'] + self.assertEqual(bin_to_asm(Path(binary_location), Path(self.output_path), asm_minlen, magic_bytes), 1) + + def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self): + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + asm_minlen = 10 + magic_bytes = ['cffaedfe'] + self.assertEqual(bin_to_asm(Path(binary_location), self.output_path, asm_minlen, magic_bytes), 0) + + def test_convert_to_asm_returns_expected_sha1(self): + input_path = 'malware_bin/' + asm_minlen_upper = 10 + asm_minlen_lower = 5 + expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"] + self.assertEqual(convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), + expected_sha1) From 0a990f9e25a5b5f20e64484a614bdea503768783 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:54:35 +0200 Subject: [PATCH 32/59] AEGIS-6405 Create __init__.py --- test/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 test/__init__.py diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..472793c --- /dev/null +++ b/test/__init__.py @@ -0,0 +1 @@ +__all__ = ["test_binary_to_asm"] From 991f9cec04b7a03efae7b2256ba0857b82cd11b2 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:00:43 +0200 Subject: [PATCH 33/59] AEGIS-6405 Create sample_binary --- asm2vec/data/sample_binary | 1 + 1 file changed, 1 insertion(+) create mode 100644 asm2vec/data/sample_binary diff --git a/asm2vec/data/sample_binary b/asm2vec/data/sample_binary new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/asm2vec/data/sample_binary @@ -0,0 +1 @@ + From 85a8b95014c3248a2678ebb761cbabe21c7f5cf7 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:03:24 +0200 Subject: [PATCH 34/59] AEGIS-6405 upload test binary This malware file is added for the purposes of test/test_binary_to_asm.py. To use locally, it should be placed at the malware_bin/ directory, as indicated in the unit test. 
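A minimal local-setup sketch (illustrative only, not part of this change): the sha1 filename and the
asm2vec/data/ location below come from this commit, and malware_bin/ is the directory the unit test
expects the binary to live in.

    # illustrative helper: copy the committed sample to where test_binary_to_asm.py looks for it
    import shutil
    from pathlib import Path

    sha1 = "5cca32eb8f9c2a024a57ce12e3fb66070662de80"
    Path("malware_bin").mkdir(exist_ok=True)
    shutil.copy(Path("asm2vec/data") / sha1, Path("malware_bin") / sha1)

    # then run the tests, e.g.: python -m unittest test.test_binary_to_asm

The tests read the binary from malware_bin/ and write the disassembled functions under malware_asm/.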
--- .../5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 0 -> 33056 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000000000000000000000000000000000000..208607f77c28082e1b391a5c7b16333894760e2d GIT binary patch literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt literal 0 HcmV?d00001 From 38cf710460dbb3615b237b4028462623646da520 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:03:47 +0200 Subject: [PATCH 35/59] AEGIS-6405 Delete asm2vec/data/sample_binary --- asm2vec/data/sample_binary | 1 - 1 file changed, 1 deletion(-) delete mode 100644 asm2vec/data/sample_binary diff --git a/asm2vec/data/sample_binary b/asm2vec/data/sample_binary deleted file mode 100644 index 8b13789..0000000 --- a/asm2vec/data/sample_binary +++ /dev/null @@ -1 +0,0 @@ - From 6f73e51a6a1e080cdcf457658850c8dc241c8e8b Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:04:50 +0200 Subject: [PATCH 36/59] AEGIS-6405 Delete asm2vec/data directory --- .../5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 33056 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 deleted file mode 100644 index 208607f77c28082e1b391a5c7b16333894760e2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt From d9f3f998ecd92182059e25cc5f1ccab98a96a3e9 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:05:18 +0200 Subject: [PATCH 37/59] AEGIS-6405 Create sample_binary --- data/sample_binary | 1 + 1 file changed, 1 insertion(+) create mode 100644 data/sample_binary diff --git a/data/sample_binary b/data/sample_binary new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/data/sample_binary @@ -0,0 +1 @@ + From 383dfedbe5196fe5be8400eadb1341965a482aa1 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:07:14 +0200 Subject: [PATCH 38/59] Add files via upload Adding malware file for the purposes of test/test_binary_to_asm.py. To use locally, the binary shall be placed in the malware_bin/ folder, as indicated in the unit test. --- data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 0 -> 33056 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000000000000000000000000000000000000..208607f77c28082e1b391a5c7b16333894760e2d GIT binary patch literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt literal 0 HcmV?d00001 From b057394aa16e9af666f9aeff5e0aa3da5fbcb209 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:08:02 +0200 Subject: [PATCH 39/59] AEGIS-6405 Delete data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 --- data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 33056 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 deleted file mode 100644 index 208607f77c28082e1b391a5c7b16333894760e2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt From 047e41895a30fcc24c2fb0655e1c463d20e59e38 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:09:59 +0200 Subject: [PATCH 40/59] AEGIS-6405 add sample binary For the purposes of test/test_binary_to_asm.py. To be placed in the malware_bin/ folder, as indicated in the unit test. --- data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 0 -> 33056 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000000000000000000000000000000000000..208607f77c28082e1b391a5c7b16333894760e2d GIT binary patch literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt literal 0 HcmV?d00001 From d4af35d6b44547ab7de221b01f3808d02417b925 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:10:50 +0200 Subject: [PATCH 41/59] AEGIS-6405 fix path --- test/test_binary_to_asm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py index f042166..c9cbc10 100644 --- a/test/test_binary_to_asm.py +++ b/test/test_binary_to_asm.py @@ -121,7 +121,7 @@ def test_fn_to_asm_returns_expected_asm(self): self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): - binary_location = "malware/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" asm_minlen = 5 magic_bytes = ['cffaedfe'] self.assertEqual(bin_to_asm(Path(binary_location), Path(self.output_path), asm_minlen, magic_bytes), 1) From fb3b5070ec62b64cae7ca74c3812d5fbb49669c9 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:13:29 +0200 Subject: [PATCH 42/59] AEGIS-6405 Delete data/sample_binary --- data/sample_binary | 1 - 1 file changed, 1 deletion(-) delete mode 100644 data/sample_binary diff --git a/data/sample_binary b/data/sample_binary deleted file mode 100644 index 8b13789..0000000 --- a/data/sample_binary +++ /dev/null @@ -1 +0,0 @@ - From df44a2f72d8311c3bb0a1f28aff91ff3768ae97c Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:44:33 +0100 Subject: [PATCH 43/59] AEGIS-6405 - test fix --- asm2vec/__init__.py | 7 +- test/test_binary_to_asm.py | 224 ++++++++++++++++++++++++++----------- 2 files changed, 162 insertions(+), 69 deletions(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 2f3c046..2d9cfd9 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1,6 @@ -__all__ = ["datatype", "model", "binary_to_asm", "train", "tensors", "version"] +import 
os + +__home__ = os.path.dirname(os.path.abspath(__path__[0])) +__data__ = os.path.join(__home__, "data") + +__all__ = ["__data__", "__home__", "binary_to_asm", "datatype", "model", "tensors", "train", "utils", "version"] diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py index c9cbc10..ce53411 100644 --- a/test/test_binary_to_asm.py +++ b/test/test_binary_to_asm.py @@ -1,62 +1,154 @@ +from os import path, mkdir from pathlib import Path +from shutil import rmtree from unittest import TestCase -from asm2vec.binary_to_asm import (bin_to_asm, - convert_to_asm, - _fn_to_asm, - _normalize, - _sha3, - _valid_exe) + +from asm2vec import __data__ +from asm2vec.binary_to_asm import (bin_to_asm, convert_to_asm, _fn_to_asm, _normalize, _sha3, _valid_exe) class TestBinaryToAsm(TestCase): @classmethod - def setUpClass(cls): + def setUpClass(cls) -> None: print("\n--- TestBinaryToAsm ---") - cls.output_path = 'malware_asm/' - cls.pdf_dict = {'name': 'main', 'size': 18, 'addr': 4294974144, - 'ops': [{'offset': 4294974144, 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, - 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rbp', - 'disasm': 'push rbp', 'bytes': '55', 'family': 'cpu', 'type': 'rpush', - 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0, - 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], - 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA=='}, - {'offset': 4294974145, 'esil': 'rsp,rbp,=', 'refptr': 0, 'fcn_addr': 4294974144, - 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbp, rsp', 'disasm': 'mov rbp, rsp', - 'bytes': '4889e5', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, - 'type2_num': 0}, {'offset': 4294974148, 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', - 'refptr': 0, 'fcn_addr': 4294974144, 'fcn_last': 4294974161, - 'size': 1, 'opcode': 'push rbx', 'disasm': 'push rbx', 'bytes': '53', - 'family': 'cpu', 'type': 'rpush', 'reloc': 'False', - 'type_num': 268435468, 'type2_num': 0}, - {'offset': 4294974149, 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, - 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rax', - 'disasm': 'push rax', 'bytes': '50', 'family': 'cpu', 'type': 'rpush', - 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0}, - {'offset': 4294974150, 'esil': 'rsi,rbx,=', 'refptr': 0, 'fcn_addr': 4294974144, - 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbx, rsi', 'disasm': 'mov rbx, rsi', - 'bytes': '4889f3', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, - 'type2_num': 0}, {'offset': 4294974153, 'ptr': 4294985864, - 'esil': '0x2db8,rip,+,[8],rax,=', 'refptr': 8, - 'fcn_addr': 4294974144, 'fcn_last': 4294974155, 'size': 7, - 'opcode': 'mov rax, qword [rip + 0x2db8]', - 'disasm': 'mov rax, qword [0x100004888]', 'bytes': '488b05b82d0000', - 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, - 'type2_num': 0, 'refs': [{'addr': 4294985864, 'type': 'DATA', - 'perm': 'r--'}]}, {'offset': 4294974160, - 'esil': 'rax,rip,=', - 'refptr': 0, - 'fcn_addr': 4294974144, - 'fcn_last': 4294974160, - 'size': 2, - 'opcode': 'jmp rax', - 'disasm': 'jmp rax', - 'bytes': 'ffe0', - 'family': 'cpu', - 'type': 'rjmp', - 'reloc': 'False', - 'type_num': 268435458, - 'type2_num': 0}]} + cls.output_path = "malware_asm/" + cls.data_path = path.join(__data__, "5cca32eb8f9c2a024a57ce12e3fb66070662de80") + cls.pdf_dict = { + 'name': 'main', + 'size': 18, + 'addr': 4294974144, + 'ops': [ + { + 'offset': 
4294974144, + 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rbp', + 'disasm': 'push rbp', + 'bytes': '55', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0, + 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], + 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA==' + }, + { + 'offset': 4294974145, + 'esil': 'rsp,rbp,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, + 'size': 3, + 'opcode': 'mov rbp, rsp', + 'disasm': 'mov rbp, rsp', + 'bytes': '4889e5', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0 + }, + { + 'offset': 4294974148, + 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rbx', + 'disasm': 'push rbx', + 'bytes': '53', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0 + }, + { + 'offset': 4294974149, + 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rax', + 'disasm': 'push rax', + 'bytes': '50', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0 + }, + { + 'offset': 4294974150, + 'esil': 'rsi,rbx,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, + 'size': 3, + 'opcode': 'mov rbx, rsi', + 'disasm': 'mov rbx, rsi', + 'bytes': '4889f3', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0 + }, + { + 'offset': 4294974153, + 'ptr': 4294985864, + 'esil': '0x2db8,rip,+,[8],rax,=', + 'refptr': 8, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974155, + 'size': 7, + 'opcode': 'mov rax, qword [rip + 0x2db8]', + 'disasm': 'mov rax, qword [0x100004888]', + 'bytes': '488b05b82d0000', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0, + 'refs': [ + { + 'addr': 4294985864, + 'type': 'DATA', + 'perm': 'r--' + } + ] + }, + { + 'offset': 4294974160, + 'esil': 'rax,rip,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974160, + 'size': 2, + 'opcode': 'jmp rax', + 'disasm': 'jmp rax', + 'bytes': 'ffe0', + 'family': 'cpu', + 'type': 'rjmp', + 'reloc': 'False', + 'type_num': 268435458, + 'type2_num': 0 + } + ] + } + mkdir(cls.output_path) + + + @classmethod + def tearDownClass(cls) -> None: + rmtree(cls.output_path) def test_sha3(self): """Should return 64-character long string""" @@ -71,17 +163,13 @@ def test_sha3(self): def test_valid_exe_when_valid_magic_bytes(self): """Should return boolean""" - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" - filename = Path(binary_location) - magic_bytes = ['cffaedfe'] - self.assertEqual(_valid_exe(filename, magic_bytes), True) + magic_bytes = ["cffaedfe"] + self.assertEqual(_valid_exe(self.data_path, magic_bytes), True) def test_valid_exe_when_not_valid_magic_bytes(self): """Should return boolean""" - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" - filename = Path(binary_location) - magic_bytes = ['cafebabe'] - self.assertEqual(_valid_exe(filename, magic_bytes), False) + magic_bytes = ["cafebabe"] + self.assertEqual(_valid_exe(self.data_path, magic_bytes), False) def test_normalize_when_offset(self): """Should return normalized opcode""" @@ -91,7 +179,7 @@ def 
test_normalize_when_offset(self): def test_normalize_when_no_offset(self): """Should return normalized opcode""" - opcode = 'mov rbx, rsi' + opcode = "mov rbx, rsi" expected_norm_opcode = "mov rbx, rsi" self.assertEqual(_normalize(opcode), expected_norm_opcode) @@ -121,21 +209,21 @@ def test_fn_to_asm_returns_expected_asm(self): self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" asm_minlen = 5 - magic_bytes = ['cffaedfe'] - self.assertEqual(bin_to_asm(Path(binary_location), Path(self.output_path), asm_minlen, magic_bytes), 1) + magic_bytes = ["cffaedfe"] + self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 1) def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self): - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" asm_minlen = 10 magic_bytes = ['cffaedfe'] - self.assertEqual(bin_to_asm(Path(binary_location), self.output_path, asm_minlen, magic_bytes), 0) + self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 0) def test_convert_to_asm_returns_expected_sha1(self): - input_path = 'malware_bin/' + input_path = __data__ asm_minlen_upper = 10 asm_minlen_lower = 5 expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"] - self.assertEqual(convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), - expected_sha1) + self.assertEqual( + convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), + expected_sha1 + ) From 7beb70579e85e778ba18606f82e85a64812594b6 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:56:27 +0100 Subject: [PATCH 44/59] AEGIS-6405 - r2env --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d92495b..c846480 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ torch>=1.7,<2 click>=7.1,<8 r2pipe>=1.5,<2 +r2env>=0.5.7,<1 From ba5f4086bc26424784209a2ec26a77e903e16294 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:06:39 +0100 Subject: [PATCH 45/59] AEGIS-6405 - radar2 install --- setup.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index be492bc..917afbb 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,25 @@ +import os from setuptools import setup, find_packages +from setuptools.command.install import install as _install from asm2vec.version import VERSION +class install(_install): + @staticmethod + def _setup_radare2() -> None: + if os.system('r2env shell "r2 -v"') == 0: + print("radar2 already set up!") + return + os.system("r2env init") + os.system("r2env add radare2") + os.system("r2env use radare2@git") + + def run(self): + _install.run(self) + self._setup_radare2() + + def readme(): with open('README.md') as f: return f.read() @@ -22,9 +39,10 @@ def read_requirements(): author_email='jamie.nutter@jamf.com', license='MIT License', install_requires=read_requirements(), - packages = find_packages(), + packages=find_packages(), zip_safe=False, include_package_data=True, test_suite='nose.collector', - tests_require=['nose'] + tests_require=['nose'], + cmdclass={'install': install} ) From 9775f0cd12110c96c2766f3cbb41597f84eb8fde 
Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:27:14 +0100 Subject: [PATCH 46/59] AEGIS-6405 - radar2 test --- setup.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 917afbb..a6e9aea 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ import os +import sys from setuptools import setup, find_packages from setuptools.command.install import install as _install @@ -8,12 +9,12 @@ class install(_install): @staticmethod def _setup_radare2() -> None: - if os.system('r2env shell "r2 -v"') == 0: - print("radar2 already set up!") - return - os.system("r2env init") - os.system("r2env add radare2") - os.system("r2env use radare2@git") + if sys.platform.startswith("linux"): + os.system("apt-get install radare2") + elif sys.platform.startswith("darwin"): + os.system("brew install radare2") + else: + print("Ensure 'radar2' is installed...") def run(self): _install.run(self) From e8735feaa257b45996e3bb6d9871db6bbdae8fb5 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 17:16:42 +0100 Subject: [PATCH 47/59] AEGIS-6405 - radare2 test 2 --- asm2vec/version.py | 2 ++ setup.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index d6f3f4b..500db07 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,2 +1,4 @@ VERSION = '1.0.2' DEV_VERSION = '0' + +radare2_version = "5.8.8" diff --git a/setup.py b/setup.py index a6e9aea..e594ee2 100644 --- a/setup.py +++ b/setup.py @@ -3,22 +3,29 @@ from setuptools import setup, find_packages from setuptools.command.install import install as _install -from asm2vec.version import VERSION +from asm2vec.version import VERSION, radare2_version class install(_install): @staticmethod def _setup_radare2() -> None: if sys.platform.startswith("linux"): - os.system("apt-get install radare2") + os.system("apt-get update") + os.system("apt-get install -y --no-install-recommends wget") + os.system(f"wget -O /tmp/radare2_${radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/${radare2_version}/radare2_${radare2_version}_arm64.deb") + os.system(f"dpkg -i /tmp/radare2_${radare2_version}_arm64.deb") + os.system("r2pm init") + os.system("r2pm update") + os.system(f"rm /tmp/radare2_${radare2_version}_arm64.deb") elif sys.platform.startswith("darwin"): os.system("brew install radare2") else: print("Ensure 'radar2' is installed...") def run(self): - _install.run(self) self._setup_radare2() + _install.run(self) + def readme(): From 3a2db7750f8038b0be66a4b3f56abc9b4a62be08 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 17:41:23 +0100 Subject: [PATCH 48/59] AEGIS-6405 - radare2 test 3 --- requirements.txt | 1 - setup.py | 20 ++++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index c846480..d92495b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ torch>=1.7,<2 click>=7.1,<8 r2pipe>=1.5,<2 -r2env>=0.5.7,<1 diff --git a/setup.py b/setup.py index e594ee2..b5a1ba8 100644 --- a/setup.py +++ b/setup.py @@ -10,13 +10,18 @@ class install(_install): @staticmethod def _setup_radare2() -> None: if sys.platform.startswith("linux"): - os.system("apt-get update") - os.system("apt-get install -y --no-install-recommends wget") - os.system(f"wget -O 
/tmp/radare2_${radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/${radare2_version}/radare2_${radare2_version}_arm64.deb") - os.system(f"dpkg -i /tmp/radare2_${radare2_version}_arm64.deb") - os.system("r2pm init") - os.system("r2pm update") - os.system(f"rm /tmp/radare2_${radare2_version}_arm64.deb") + commands = [ + "apt-get update", + "apt-get install -y --no-install-recommends wget", + f"wget -O /tmp/radare2_{radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_arm64.deb", + f"dpkg -i /tmp/radare2_{radare2_version}_arm64.deb", + "r2pm init", + "r2pm update", + f"rm /tmp/radare2_{radare2_version}_arm64.deb" + ] + for command in commands: + if os.system(command) != 0: + raise Exception(f"Install radare2 failed: '{command}'") elif sys.platform.startswith("darwin"): os.system("brew install radare2") else: @@ -27,7 +32,6 @@ def run(self): _install.run(self) - def readme(): with open('README.md') as f: return f.read() From d36eced48ea68c97de849a59fa4679898512ab30 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:19:58 +0100 Subject: [PATCH 49/59] AEGIS-6405 - setup arch --- setup.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index b5a1ba8..19a3051 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import sys +import platform from setuptools import setup, find_packages from setuptools.command.install import install as _install @@ -9,21 +10,29 @@ class install(_install): @staticmethod def _setup_radare2() -> None: - if sys.platform.startswith("linux"): + if sys.platform.startswith("linux"): # Install required in Docker images + machine = platform.machine() + if machine in ["aarch64", "arm"]: + architecture = "arm64" + elif machine in ["x86_64"]: + architecture = "amd64" + elif machine in ["i386", "i686"]: + architecture = "i386" + else: + raise Exception(f"No architecture for Linux Machine: '{machine}'") + commands = [ "apt-get update", "apt-get install -y --no-install-recommends wget", - f"wget -O /tmp/radare2_{radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_arm64.deb", - f"dpkg -i /tmp/radare2_{radare2_version}_arm64.deb", + f"wget -O /tmp/radare2_{radare2_version}_{architecture}.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_{architecture}.deb", + f"dpkg -i /tmp/radare2_{radare2_version}_{architecture}.deb", "r2pm init", "r2pm update", - f"rm /tmp/radare2_{radare2_version}_arm64.deb" + f"rm /tmp/radare2_{radare2_version}_{architecture}.deb" ] for command in commands: if os.system(command) != 0: raise Exception(f"Install radare2 failed: '{command}'") - elif sys.platform.startswith("darwin"): - os.system("brew install radare2") else: print("Ensure 'radar2' is installed...") From 7ab939b6e5a77eb17107313af3516176d5a4df72 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:16:42 +0100 Subject: [PATCH 50/59] AEGIS-6406 - moved scripts --- README.md | 166 ++++-------------------------------------- asm2vec/__init__.py | 5 +- asm2vec/data.py | 43 +++++++++++ asm2vec/model.py | 32 +++++++- asm2vec/similarity.py | 42 +++++++++++ asm2vec/test.py | 39 ++++++++++ asm2vec/train.py | 162 ++++++----------------------------------- asm2vec/utilities.py | 55 
++++++++++++++ requirements.txt | 1 - scripts/compare.py | 44 ----------- scripts/test.py | 44 ----------- 11 files changed, 251 insertions(+), 382 deletions(-) create mode 100644 asm2vec/data.py create mode 100644 asm2vec/similarity.py create mode 100644 asm2vec/test.py create mode 100644 asm2vec/utilities.py delete mode 100644 scripts/compare.py delete mode 100644 scripts/test.py diff --git a/README.md b/README.md index c5fc4ae..637d5db 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # asm2vec-pytorch -release 1.0.0 +release 1.0.3 mit python @@ -9,30 +9,17 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B ## Requirements -python >= 3.10 - -| packages | for | -| --- | --- | -| r2pipe | `scripts/bin2asm.py` | -| click | `scripts/*` | -| torch | almost all code need it | - -You also need to install `radare2` to run `scripts/bin2asm.py`. `r2pipe` is just the python interface to `radare2` - -If you only want to use the library code, you just need to install `torch` +* python >= 3.10 +* radare2 +* Packages listed in `requirements.txt` ## Install ``` +pip install -r requirements.txt && python setup.py install ``` -or - -``` -pip install git+https://github.com/oalieno/asm2vec-pytorch.git -``` - ## Benchmark An implementation already exists here: [Lancern/asm2vec](https://github.com/Lancern/asm2vec) @@ -46,141 +33,20 @@ Following is the benchmark of training 1000 functions in 1 epoch. ## Get Started -```bash -python scripts/bin2asm.py -i /bin/ -o asm/ -``` - -First generate asm files from binarys under `/bin/`. -You can hit `Ctrl+C` anytime when there is enough data. - -```bash -python scripts/train.py -i asm/ -l 100 -o model.pt --epochs 100 -``` - -Try to train the model using only 100 functions and 100 epochs for a taste. -Then you can use more data if you want. - -```bash -python scripts/test.py -i asm/123456 -m model.pt -``` - -After you train your model, try to grab an assembly function and see the result. -This script will show you how the model perform. -Once you satisfied, you can take out the embedding vector of the function and do whatever you want with it. +### TODO - update this with description about to how use etc -## Usage +## Tests -### bin2asm.py +### Run test suite -``` -Usage: bin2asm.py [OPTIONS] +* Run all tests: ``python -m unittest discover -v`` +* Run a certain module's tests: ``python -m unittest -v test.test_binary_to_asm`` +* Run a certain test class: ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm`` +* Run a certain test method: - Extract assembly functions from binary executable + ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm.test_sha3`` -Options: - -i, --input TEXT input directory / file [required] - -o, --output TEXT output directory - -l, --len INTEGER ignore assembly code with instructions amount smaller - than minlen +### Coverage - --help Show this message and exit. 
-``` - -```bash -# Example -python bin2asm.py -i /bin/ -o asm/ -``` - -### train.py - -``` -Usage: train.py [OPTIONS] - -Options: - -i, --input TEXT training data folder [required] - -o, --output TEXT output model path [default: model.pt] - -m, --model TEXT load previous trained model path - -l, --limit INTEGER limit the number of functions to be loaded - -d, --ebedding-dimension INTEGER - embedding dimension [default: 100] - -b, --batch-size INTEGER batch size [default: 1024] - -e, --epochs INTEGER training epochs [default: 10] - -n, --neg-sample-num INTEGER negative sampling amount [default: 25] - -a, --calculate-accuracy whether calculate accuracy ( will be - significantly slower ) - - -c, --device TEXT hardware device to be used: cpu / cuda / - auto [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - --help Show this message and exit. -``` - -```bash -# Example -python train.py -i asm/ -o model.pt --epochs 100 -``` - -### test.py - -``` -Usage: test.py [OPTIONS] - -Options: - -i, --input TEXT target function [required] - -m, --model TEXT model path [required] - -e, --epochs INTEGER training epochs [default: 10] - -n, --neg-sample-num INTEGER negative sampling amount [default: 25] - -l, --limit INTEGER limit the amount of output probability result - -c, --device TEXT hardware device to be used: cpu / cuda / auto - [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - -p, --pretty pretty print table [default: False] - --help Show this message and exit. -``` - -```bash -# Example -python test.py -i asm/123456 -m model.pt -``` - -``` -┌──────────────────────────────────────────┐ -│ endbr64 │ -│ ➔ push r15 │ -│ push r14 │ -├────────┬─────────────────────────────────┤ -│ 34.68% │ [rdx + rsi*CONST + CONST] │ -│ 20.29% │ push │ -│ 16.22% │ r15 │ -│ 04.36% │ r14 │ -│ 03.55% │ r11d │ -└────────┴─────────────────────────────────┘ -``` - -### compare.py - -``` -Usage: compare.py [OPTIONS] - -Options: - -i1, --input1 TEXT target function 1 [required] - -i2, --input2 TEXT target function 2 [required] - -m, --model TEXT model path [required] - -e, --epochs INTEGER training epochs [default: 10] - -c, --device TEXT hardware device to be used: cpu / cuda / auto - [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - --help Show this message and exit. -``` - -```bash -# Example -python compare.py -i1 asm/123456 -i2 asm/654321 -m model.pt -e 30 -``` - -``` -cosine similarity : 0.873684 -``` +* Create report: ``coverage run -m unittest discover -v`` +* Read report: ``coverage report -m`` \ No newline at end of file diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 2d9cfd9..6e9d963 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -3,4 +3,7 @@ __home__ = os.path.dirname(os.path.abspath(__path__[0])) __data__ = os.path.join(__home__, "data") -__all__ = ["__data__", "__home__", "binary_to_asm", "datatype", "model", "tensors", "train", "utils", "version"] +__all__ = [ + "__data__", "__home__", "binary_to_asm", "data", "datatype", "model", "similarity", "tensors", "test", "train", + "utilities", "version" +] diff --git a/asm2vec/data.py b/asm2vec/data.py new file mode 100644 index 0000000..6713c38 --- /dev/null +++ b/asm2vec/data.py @@ -0,0 +1,43 @@ +import os +from pathlib import Path +from torch.utils.data import Dataset + +from asm2vec.datatype import Tokens, Function + + +class AsmDataset(Dataset): + # TODO - doc string - explain what this class does - how does it extend `Dataset`? 
+ def __init__(self, x, y): + self.x = x + self.y = y + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + return self.x[index], self.y[index] + + +def load_data(paths, limit=None): + # TODO - doc string + if type(paths) is not list: + paths = [paths] + + filenames = [] + for path in paths: + if os.path.isdir(path): + filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) + if os.path.isfile(Path(path) / filename)] + else: + filenames += [Path(path)] + + functions, tokens = [], Tokens() + for i, filename in enumerate(filenames): + if limit and i >= limit: + break + with open(filename) as f: + fn = Function.load(f.read()) + functions.append(fn) + tokens.add(fn.tokens()) + + return functions, tokens diff --git a/asm2vec/model.py b/asm2vec/model.py index 74a6ace..51dc433 100644 --- a/asm2vec/model.py +++ b/asm2vec/model.py @@ -1,9 +1,14 @@ import torch import torch.nn as nn +from asm2vec.datatype import Tokens + bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1) +# TODO - doc strings + + class ASM2VEC(nn.Module): def __init__(self, vocab_size, function_size, embedding_size): super(ASM2VEC, self).__init__() @@ -44,9 +49,34 @@ def forward(self, inp, pos, neg): label = torch.cat([torch.ones(batch_size, 3), torch.zeros(batch_size, neg.shape[1])], dim=1).to(device) return bce(sigmoid(pred), label) - def predict(self, inp, pos): + def predict(self, inp, pos): # Why is pos not used? Why does Predict differ so much from Forward? device, batch_size = inp.device, inp.shape[0] v = self.v(inp) probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1). to(device)), v).squeeze(dim=2) return softmax(probs) + + +def save_model(path: str, model: ASM2VEC, tokens: Tokens) -> None: + torch.save( + { + 'model_params': ( + model.embeddings.num_embeddings, + model.embeddings_f.num_embeddings, + model.embeddings.embedding_dim + ), + 'model': model.state_dict(), + 'tokens': tokens.state_dict(), + }, + path + ) + + +def load_model(path: str, device: str = 'cpu') -> tuple[ASM2VEC, Tokens]: + checkpoint = torch.load(path, map_location=device) + tokens = Tokens() + tokens.load_state_dict(checkpoint['tokens']) + model = ASM2VEC(*checkpoint['model_params']) + model.load_state_dict(checkpoint['model']) + model = model.to(device) + return model, tokens diff --git a/asm2vec/similarity.py b/asm2vec/similarity.py new file mode 100644 index 0000000..bce31b6 --- /dev/null +++ b/asm2vec/similarity.py @@ -0,0 +1,42 @@ +import torch + +from asm2vec.data import load_data +from asm2vec.model import load_model +from asm2vec.train import train + + +def cosine_similarity(v1, v2) -> float: + return (v1 @ v2 / (v1.norm() * v2.norm())).item() + + +def compare_two( + data_path_1: str, data_path_2: str, model_path: str, epochs: int = 10, device: str = "cpu", + learning_rate: float = 0.02 +) -> float: + # TODO - doc string + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + # load model, tokens + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data([data_path_1, data_path_2]) + tokens.update(tokens_new) + model.update(2, tokens.size()) + model = model.to(device) + + # train function embedding + model = train( + functions, + tokens, + model=model, + epochs=epochs, + device=device, + mode="test", + learning_rate=learning_rate + ) + + # compare 2 function vectors + v1, v2 = model.to("cpu").embeddings_f(torch.tensor([0, 1])) + similarity = cosine_similarity(v1, v2) 
+ print(f"cosine similarity : {similarity:.6f}") + return similarity diff --git a/asm2vec/test.py b/asm2vec/test.py new file mode 100644 index 0000000..c4ef7ba --- /dev/null +++ b/asm2vec/test.py @@ -0,0 +1,39 @@ +import torch + +from asm2vec.data import load_data +from asm2vec.model import load_model +from asm2vec.train import train, preprocess +from asm2vec.utilities import show_probs + + +def test_model( + data_path: str, model_path: str, epochs: int = 10, neg_sample_num: int = 25, limit: int | None = None, + device: str = "cpu", learning_rate: float = 0.02, pretty: bool = False +) -> None: + # TODO - doc string + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + # load model, tokens + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data(data_path) + tokens.update(tokens_new) + model.update(1, tokens.size()) + model = model.to(device) + + # train function embedding + model = train( + functions, + tokens, + model=model, + epochs=epochs, + neg_sample_num=neg_sample_num, + device=device, + mode="test", + learning_rate=learning_rate + ) + + # show predicted probability results + x, y = preprocess(functions, tokens) + probs = model.predict(x.to(device), y.to(device)) + show_probs(x, y, probs, tokens, limit=limit, pretty=pretty) diff --git a/asm2vec/train.py b/asm2vec/train.py index 12b8fe5..eb418d4 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -1,49 +1,12 @@ -import os import time import torch -import logging from pathlib import Path -from torch.utils.data import DataLoader, Dataset -from asm2vec.model import ASM2VEC -from asm2vec.datatype import Tokens, Function, Instruction +from torch.utils.data import DataLoader -logging.basicConfig(level=logging.INFO, format='%(message)s') - - -class AsmDataset(Dataset): - def __init__(self, x, y): - self.x = x - self.y = y - - def __len__(self): - return len(self.x) - - def __getitem__(self, index): - return self.x[index], self.y[index] - - -def load_data(paths, limit=None): - if type(paths) is not list: - paths = [paths] - - filenames = [] - for path in paths: - if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) - if os.path.isfile(Path(path) / filename)] - else: - filenames += [Path(path)] - - functions, tokens = [], Tokens() - for i, filename in enumerate(filenames): - if limit and i >= limit: - break - with open(filename) as f: - fn = Function.load(f.read()) - functions.append(fn) - tokens.add(fn.tokens()) - - return functions, tokens +from asm2vec.data import AsmDataset, load_data +from asm2vec.datatype import Function, Tokens +from asm2vec.model import ASM2VEC, load_model, save_model +from asm2vec.utilities import accuracy, callback def preprocess(functions, tokens): @@ -57,19 +20,12 @@ def preprocess(functions, tokens): def train( - functions, - tokens, - model=None, - embedding_size=100, - batch_size=1024, - epochs=10, - neg_sample_num=25, - calc_acc=False, - device='cpu', - mode='train', - callback=None, - learning_rate=0.02 + functions: list[Function], tokens: Tokens, model: ASM2VEC | None = None, embedding_size: int = 100, + batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu', + mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02 ): + # TODO: doc string + # TODO: test mode in train... this is confusing! 
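A minimal usage sketch of the `compare_two` helper added in this patch, assuming a previously trained model and two disassembled functions already on disk; every path below is a placeholder, not a value prescribed by the patch:

```python
# Illustrative sketch only: paths are placeholders; the keyword arguments
# mirror compare_two() as introduced in asm2vec/similarity.py.
from asm2vec.similarity import compare_two

score = compare_two(
    data_path_1="asm/func_a",   # placeholder path to assembly function 1
    data_path_2="asm/func_b",   # placeholder path to assembly function 2
    model_path="model.pt",      # placeholder path to a trained model
    epochs=10,
    device="cpu",
)
print(score)  # cosine similarity as a float; compare_two also prints it
```

`test_model` from the accompanying `asm2vec/test.py` follows the same pattern, but prints the top predicted tokens for a single function instead of returning a similarity score.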
if mode == 'train': if model is None: model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) @@ -100,7 +56,7 @@ def train( probs = model.predict(inp.to(device), pos.to(device)) accs.append(accuracy(pos, probs)) - if callback: + if verbose: callback({ 'model': model, 'tokens': tokens, @@ -113,98 +69,22 @@ def train( return model -def save_model(path, model, tokens): - torch.save({ - 'model_params': ( - model.embeddings.num_embeddings, - model.embeddings_f.num_embeddings, - model.embeddings.embedding_dim - ), - 'model': model.state_dict(), - 'tokens': tokens.state_dict(), - }, path) - - -def load_model(path, device='cpu'): - checkpoint = torch.load(path, map_location=device) - tokens = Tokens() - tokens.load_state_dict(checkpoint['tokens']) - model = ASM2VEC(*checkpoint['model_params']) - model.load_state_dict(checkpoint['model']) - model = model.to(device) - return model, tokens - - -def show_probs(x, y, probs, tokens, limit=None, pretty=False): - if pretty: - tl, tr, bl, br = '┌', '┐', '└', '┘' - lm, rm, tm, bm = '├', '┤', '┬', '┴' - h, v = '─', '│' - arrow = ' ➔' - else: - tl, tr, bl, br = '+', '+', '+', '+' - lm, rm, tm, bm = '+', '+', '+', '+' - h, v = '-', '|' - arrow = '->' - top = probs.topk(5) - for i, (xi, yi) in enumerate(zip(x, y)): - if limit and i >= limit: - break - xi, yi = xi.tolist(), yi.tolist() - print(tl + h * 42 + tr) - print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') - print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') - print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') - print(lm + h * 8 + tm + h * 33 + rm) - for value, index in zip(top.values[i], top.indices[i]): - if index in yi: - colorbegin, colorclear = '\033[92m', '\033[0m' - else: - colorbegin, colorclear = '', '' - print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' - f'{tokens[index.item()].name:31}{colorclear} {v}') - print(bl + h * 8 + bm + h * 33 + br) - - -def accuracy(y, probs): - return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) - - -def callback(context) -> None: - """Prettifies the display of accuracy, if chosen - """ - progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ - loss = {context["loss"]:.4f}' - - if context["accuracy"]: - progress += f', accuracy = {context["accuracy"]:.4f}' - logging.info(f"{progress}") - - def train_asm2vec_model( - train_set: str, - new_model: str, - model_path: str | None, - epochs: int, - limit: int = None, - calc_acc: bool = False, - embedding_size: int = 100, - batch_size: int = 1024, - neg_sample: int = 25, - learning_rate: float = 0.02, - device: str = 'cpu' + train_set: str, new_model: str, model_path: str | None, epochs: int, limit: int | None = None, + calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, neg_sample: int = 25, + learning_rate: float = 0.02, device: str = 'cpu' ) -> ASM2VEC: - - """Trains an asm2vec model + # TODO - this is just a wrapper - can we do this smarter? 
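A rough usage sketch of the `train_asm2vec_model` wrapper introduced here (its docstring and body continue below); the paths and hyper-parameter values are placeholders rather than values prescribed by this patch:

```python
# Illustrative sketch only: folder and file names are placeholders.
from asm2vec.train import train_asm2vec_model

model = train_asm2vec_model(
    train_set="asm/",      # folder of disassembled functions (placeholder)
    new_model="model.pt",  # where the trained model is saved
    model_path=None,       # no previously trained model to continue from
    epochs=100,
    limit=100,             # only load the first 100 functions
    calc_acc=True,         # display per-epoch accuracy (slower)
    device="auto",         # per the docstring: 'auto' | 'cuda' | 'cpu'
)
```

Per its docstring, passing `model_path` continues training from an already trained model instead of starting fresh.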
+ """Trains an ASM2VEC model :param train_set: path to the training dataset :param new_model: path to the model to be trained :param model_path: path to already trained model - :param limit: number of the assembly functions that the model will be trained on; - if not defined, all the assembly functions in train_set_path + :param limit: number of the assembly functions that the model will be trained on; if not defined, all the assembly + functions in train_set_path :param epochs: number of epochs :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training - :param embedding_size: size of the vector representation for a token; an assembly function - will be represented with a vector twice that size + :param embedding_size: size of the vector representation for a token; an assembly function will be represented + with a vector twice that size :param batch_size: the size of batches for training :param neg_sample: negative sampling amount :param device: 'auto' | 'cuda' | 'cpu' @@ -233,7 +113,7 @@ def train_asm2vec_model( neg_sample_num=neg_sample, calc_acc=calc_acc, device=device, - callback=callback, + verbose=True, learning_rate=learning_rate ) save_model(new_model, model, tokens) diff --git a/asm2vec/utilities.py b/asm2vec/utilities.py new file mode 100644 index 0000000..dd39aac --- /dev/null +++ b/asm2vec/utilities.py @@ -0,0 +1,55 @@ +import logging +import torch + +from asm2vec.datatype import Instruction + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +# TODO - Why do we have both logging and print? +# TODO - Doc strings + +def accuracy(y, probs): + return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) + + +def callback(context) -> None: + """Prettifies the display of accuracy, if chosen + """ + progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ + loss = {context["loss"]:.4f}' + + if context["accuracy"]: + progress += f', accuracy = {context["accuracy"]:.4f}' + logging.info(f"{progress}") + + +def show_probs(x, y, probs, tokens, limit=None, pretty=False): + if pretty: + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' + arrow = ' ➔' + else: + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' + arrow = '->' + top = probs.topk(5) + for i, (xi, yi) in enumerate(zip(x, y)): + if limit and i >= limit: + break + xi, yi = xi.tolist(), yi.tolist() + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) + for value, index in zip(top.values[i], top.indices[i]): + if index in yi: + colorbegin, colorclear = '\033[92m', '\033[0m' + else: + colorbegin, colorclear = '', '' + print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) diff --git a/requirements.txt b/requirements.txt index d92495b..3163633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ torch>=1.7,<2 -click>=7.1,<8 r2pipe>=1.5,<2 diff --git a/scripts/compare.py b/scripts/compare.py deleted file mode 100644 index 3860b83..0000000 --- a/scripts/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -import click -import asm2vec - -def cosine_similarity(v1, v2): - 
return (v1 @ v2 / (v1.norm() * v2.norm())).item() - -@click.command() -@click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True) -@click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True) -@click.option('-m', '--model', 'mpath', help='model path', required=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -def cli(ipath1, ipath2, mpath, epochs, device, lr): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # load model, tokens - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data([ipath1, ipath2]) - tokens.update(tokens_new) - model.update(2, tokens.size()) - model = model.to(device) - - # train function embedding - model = asm2vec.utils.train( - functions, - tokens, - model=model, - epochs=epochs, - device=device, - mode='test', - learning_rate=lr - ) - - # compare 2 function vectors - v1, v2 = model.to('cpu').embeddings_f(torch.tensor([0, 1])) - - print(f'cosine similarity : {cosine_similarity(v1, v2):.6f}') - -if __name__ == '__main__': - cli() diff --git a/scripts/test.py b/scripts/test.py deleted file mode 100644 index 31372aa..0000000 --- a/scripts/test.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -import click -import asm2vec - -@click.command() -@click.option('-i', '--input', 'ipath', help='target function', required=True) -@click.option('-m', '--model', 'mpath', help='model path', required=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True) -@click.option('-l', '--limit', help='limit the amount of output probability result', type=int) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -@click.option('-p', '--pretty', default=False, help='pretty print table', show_default=True, is_flag=True) -def cli(ipath, mpath, epochs, neg_sample_num, limit, device, lr, pretty): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # load model, tokens - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data(ipath) - tokens.update(tokens_new) - model.update(1, tokens.size()) - model = model.to(device) - - # train function embedding - model = asm2vec.utils.train( - functions, - tokens, - model=model, - epochs=epochs, - neg_sample_num=neg_sample_num, - device=device, - mode='test', - learning_rate=lr - ) - - # show predicted probability results - x, y = asm2vec.utils.preprocess(functions, tokens) - probs = model.predict(x.to(device), y.to(device)) - asm2vec.utils.show_probs(x, y, probs, tokens, limit=limit, pretty=pretty) - -if __name__ == '__main__': - cli() From 8f572c35e39b980bc544e258abcffebe87fcb0e9 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:51:52 +0100 Subject: [PATCH 51/59] TRIVIAL - doc strings --- asm2vec/binary_to_asm.py | 61 
++++++++++++++++++++++------------------ asm2vec/datatype.py | 2 ++ asm2vec/tensors.py | 30 ++++++++++---------- 3 files changed, 50 insertions(+), 43 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 3c141d9..58ccaa1 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -9,16 +9,19 @@ def _sha3(asm: str) -> str: - """Produces SHA3 for each assembly function - :param asm: input assembly function + """ + Produces SHA3 for each assembly function + :param asm: Input assembly function + :return: Hashed string """ return hashlib.sha3_256(asm.encode()).hexdigest() def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: - """Extracts magic bytes and returns the header - :param filename: name of the malware file (SHA1) - :param magic_bytes for the specific OS/type of binary + """ + Extracts magic bytes and returns the header + :param filename: Name of the malware file (SHA1) + :param magic_bytes: For the specific OS/type of binary :return: Boolean of the header existing in magic bytes """ magics = [bytes.fromhex(i) for i in magic_bytes] @@ -28,8 +31,10 @@ def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: def _normalize(opcode: str) -> str: - """ Normalizes the input string - :param opcode: opcode of the binary + """ + Normalizes the input opcode string + :param opcode: Opcode of the binary + :return Normalized opcode string """ opcode = opcode.replace(' - ', ' + ') opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) @@ -39,9 +44,11 @@ def _normalize(opcode: str) -> str: def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: - """Converts functions to assembly code + """ + Converts functions to assembly code :param pdf: disassembly :param asm_minlen: minimum length of assembly functions to be extracted + :return: ASM string """ if pdf is None: return '' @@ -71,7 +78,8 @@ def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int: - """Fragments the input binary into assembly functions via r2pipe + """ + Fragments the input binary into assembly functions via r2pipe :param filename: name of the malware file (SHA1) :param output_path: path to the folder to store the assembly functions for each malware :param asm_minlen: the minimum length of assembly functions to be extracted @@ -102,25 +110,22 @@ def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: return count -def convert_to_asm(input_path: str, - output_path: str, - minlen_upper: int, - minlen_lower: int, - magic_bytes: list[str] = None - ) -> list: - """ Extracts assembly functions from malware files and saves them - into separate folder per binary - :param input_path: the path to the malware binaries - :param output_path: the path for the assembly functions to be extracted - :param minlen_upper: The minimum number of assembly functions needed for disassembling - :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number - of assembly functions to minlen_lower - :param magic_bytes: list of valid for the specific OS/type of binary; e.g. 
- 'cffaedfe' for Mach-O Little Endian (64-bit) - 'feedfacf' for Mach-O Big Endian (64-bit) - 'cefaedfe' for Mach-O Little Endian (32-bit) - 'feedface': Mach-O Big Endian (32-bit) - 'cafebabe' Universal Binary Big Endian +def convert_to_asm( + input_path: str, output_path: str, minlen_upper: int, minlen_lower: int, magic_bytes: list[str] = None +) -> list: + """ + Extracts assembly functions from malware files and saves them into separate folder per binary + :param input_path: Path to the malware binaries + :param output_path: Path for the assembly functions to be extracted + :param minlen_upper: Minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling is not possible with minlen_upper, lower the minimum number of assembly + functions to minlen_lower (WHAT?) + :param magic_bytes: List of valid for the specific OS/type of binary, e.g. + - 'cffaedfe' for Mach-O Little Endian (64-bit) + - 'feedfacf' for Mach-O Big Endian (64-bit) + - 'cefaedfe' for Mach-O Little Endian (32-bit) + - 'feedface': Mach-O Big Endian (32-bit) + - 'cafebabe' Universal Binary Big Endian :return: List of sha1 of disassembled malware files """ if not magic_bytes: diff --git a/asm2vec/datatype.py b/asm2vec/datatype.py index b6451d8..f618800 100644 --- a/asm2vec/datatype.py +++ b/asm2vec/datatype.py @@ -2,6 +2,8 @@ import random import warnings +# TODO - doc strings + class Token: def __init__(self, name, index): diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 01b306f..fde5296 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -1,26 +1,26 @@ import os import torch import logging -from asm2vec.train import train, load_model, load_data from pathlib import Path +from asm2vec.train import train, load_model, load_data + logging.basicConfig(level=logging.INFO, format='%(message)s') -def calc_tensors(asm_path: str, - tensor_path: str, - model_path: str, - epochs: int, - device: str = 'cpu', - learning_rate: float = 0.02) -> list: - """Calculates vector representation of a binary as the mean per column - of the vector representations of its assembly functions - :param asm_path: folder with assembly function in a subfolder per binary - :param tensor_path: folder to store the tensors - :param model_path: path to the trained model - :param epochs: number of epochs - :param device: 'auto' | 'cuda' | 'cpu' - :param learning_rate: learning rate +def calc_tensors( + asm_path: str, tensor_path: str, model_path: str, epochs: int, device: str = 'cpu', learning_rate: float = 0.02 +) -> list: + """ + Calculates vector representation of a binary as the mean per column of the vector representations of its assembly + functions. 
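A short usage sketch of the `convert_to_asm` helper documented above; the folder names are placeholders and the magic-bytes value mirrors the Mach-O example used in the unit tests earlier in this series:

```python
# Illustrative sketch only: requires radare2/r2pipe; folder names are placeholders.
from asm2vec.binary_to_asm import convert_to_asm

sha1_list = convert_to_asm(
    input_path="malware_bin/",   # folder of input binaries (placeholder)
    output_path="malware_asm/",  # one sub-folder of assembly functions per binary
    minlen_upper=10,             # stricter extraction threshold tried first
    minlen_lower=5,              # relaxed fallback threshold
    magic_bytes=["cffaedfe"],    # Mach-O Little Endian (64-bit)
)
print(sha1_list)  # SHA1 names of the binaries that were disassembled
```

The per-binary output folders written here are what `train_asm2vec_model` and `calc_tensors` expect as input.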
+ :param asm_path: Path to folder with assembly function in a sub-folder per binary + :param tensor_path: Path to folder to store the tensors + :param model_path: Path to the trained model + :param epochs: Number of epochs + :param device: 'auto' | 'cuda' | 'cpu' + :param learning_rate: Learning rate + :return: List of tensors """ tensors_list = [] if device == 'auto': From 1cf86db35d3645781f2408cab3653621fdd02869 Mon Sep 17 00:00:00 2001 From: "CI2.0" Date: Thu, 5 Oct 2023 15:03:00 +0000 Subject: [PATCH 52/59] [Jenkins] Set version to 1.0.3 --- asm2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index 500db07..c85dc7e 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,4 +1,4 @@ -VERSION = '1.0.2' +VERSION = '1.0.3' DEV_VERSION = '0' radare2_version = "5.8.8" From 9d794e25c4c2604a617d3d099000ecc38c6eee6f Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:15:38 +0200 Subject: [PATCH 53/59] AEGIS-6406 rename "test" mode to "update" --- asm2vec/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/test.py b/asm2vec/test.py index c4ef7ba..b80cc14 100644 --- a/asm2vec/test.py +++ b/asm2vec/test.py @@ -29,7 +29,7 @@ def test_model( epochs=epochs, neg_sample_num=neg_sample_num, device=device, - mode="test", + mode="update", learning_rate=learning_rate ) From 2fec9d1280cd89880676594398ac016ae12d26d1 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:19:49 +0200 Subject: [PATCH 54/59] AEGIS-6406 add docstring, change "test" mode to "update" mode --- asm2vec/train.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index eb418d4..4de7a81 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -2,7 +2,6 @@ import torch from pathlib import Path from torch.utils.data import DataLoader - from asm2vec.data import AsmDataset, load_data from asm2vec.datatype import Function, Tokens from asm2vec.model import ASM2VEC, load_model, save_model @@ -24,15 +23,28 @@ def train( batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu', mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02 ): - # TODO: doc string - # TODO: test mode in train... this is confusing! 
+ """This function trains a model on the given assembly functions and tokens + :param functions: list of assembly functions + :param tokens: tokens (operations, operands) of the assembly function + :param model: type of the model; ; (Optional, default ASM2VEC) + :param embedding_size: size of the tensor representation of an assembly function; (Optional, default value = 100) + :param batch_size: size of the batch for each epoch of training; (Optional, default value = 1024) + :param epochs: number of epochs for training the model; (Optional, default value = 10) + :param neg_sample_num: size of the negative sample; (Optional, default value = 25) + :param calc_acc: if set to True, the accuracy per training epoch is displayed; (Optional, default False) + :param device: the device used for processing; (Optional, default 'cpu') + :param mode: 'train' (to train a new model) | 'update' (to add to an already trained model's dictionary); + (Optional, default 'train') + :param verbose: if True performs training in verbose mode; (Optional, default False) + :param learning_rate: learning rate + """ if mode == 'train': if model is None: model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - elif mode == 'test': + elif mode == 'update': if model is None: - raise ValueError("test mode required pretrained model") + raise ValueError("Update mode requires a pretrained model") optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) else: raise ValueError("Unknown mode") @@ -89,6 +101,7 @@ def train_asm2vec_model( :param neg_sample: negative sampling amount :param device: 'auto' | 'cuda' | 'cpu' :param learning_rate: learning rate + :return an ASM2VEC model """ if device == 'auto': From 3c9833c921d9fc4812f94a232ef34043b225b592 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:29:44 +0200 Subject: [PATCH 55/59] AEGIS-6406 add docstring, set mode to "update" --- asm2vec/similarity.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/asm2vec/similarity.py b/asm2vec/similarity.py index bce31b6..ea52327 100644 --- a/asm2vec/similarity.py +++ b/asm2vec/similarity.py @@ -13,30 +13,36 @@ def compare_two( data_path_1: str, data_path_2: str, model_path: str, epochs: int = 10, device: str = "cpu", learning_rate: float = 0.02 ) -> float: - # TODO - doc string + """This function produces the cosine similarity of a pair of assembly functions + :param data_path_1: the path to the assembly function no. 1 + :param data_path_2: the path to the assembly function no. 
2 + :param model_path: the path to the trained asm2vec model + :param epochs: the number of epochs for calculating the tensor representations; (Optional, default = 10) + :param device: 'auto' | 'cuda' | 'cpu' (Optional, default 'cpu') + :param learning_rate: learning rate; (Optional; default = 0.02) + :return the cosine similarity value + """ if device == "auto": device = "cuda" if torch.cuda.is_available() else "cpu" - # load model, tokens model, tokens = load_model(model_path, device=device) functions, tokens_new = load_data([data_path_1, data_path_2]) tokens.update(tokens_new) model.update(2, tokens.size()) model = model.to(device) - - # train function embedding + model = train( functions, tokens, model=model, epochs=epochs, device=device, - mode="test", + mode="update", learning_rate=learning_rate ) - # compare 2 function vectors v1, v2 = model.to("cpu").embeddings_f(torch.tensor([0, 1])) similarity = cosine_similarity(v1, v2) - print(f"cosine similarity : {similarity:.6f}") + print(f"Cosine similarity : {similarity:.6f}") + return similarity From 2a8433a97a8b3840022eb02a347d17aaa6f918f1 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:39:23 +0200 Subject: [PATCH 56/59] AEGIS-6406 change mode from "test" to "update" --- asm2vec/tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index fde5296..78a356e 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -55,7 +55,7 @@ def calc_tensors( model=model, epochs=epochs, device=device, - mode='test', + mode='update', learning_rate=learning_rate ) From 45e10f744047cdfc8eac40a7d7e808a03be6c4c3 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Wed, 25 Oct 2023 12:04:36 +0200 Subject: [PATCH 57/59] AEGIS-6406 add identation Disassemble only if the folder does not exist --- asm2vec/binary_to_asm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 58ccaa1..70218d7 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -145,18 +145,18 @@ def convert_to_asm( out_dir = os.path.join(asm_dir, entry.name) if not (os.path.exists(out_dir)): os.mkdir(out_dir) - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) - if function_count == 0: - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) if function_count == 0: - os.rmdir(out_dir) - logging.info('The binary {} was not disassembled'.format(entry.name)) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + if function_count == 0: + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) else: binary_count += 1 disassembled_bins.append(entry.name) - else: - binary_count += 1 - disassembled_bins.append(entry.name) else: not_found += 1 logging.info("[Error] No such file or directory: {}".format(binary_dir)) From 8d1b419e928890b2042b9772dd76cbb9a409be13 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Wed, 25 Oct 2023 13:09:22 +0200 Subject: [PATCH 58/59] AEGIS-6406 fix function_count Correctly calculate function_count per binary, not cumulatively --- asm2vec/binary_to_asm.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 70218d7..28b573f 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -145,9 +145,9 @@ def convert_to_asm( out_dir = os.path.join(asm_dir, entry.name) if not (os.path.exists(out_dir)): os.mkdir(out_dir) - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) + function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) if function_count == 0: - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) if function_count == 0: os.rmdir(out_dir) logging.info('The binary {} was not disassembled'.format(entry.name)) From 90c9f991ad015f1ef28df92fd50b5e9041157c19 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Wed, 25 Oct 2023 18:05:44 +0200 Subject: [PATCH 59/59] AEGIS-6406 add magic bytes --- asm2vec/binary_to_asm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 28b573f..1da1389 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -121,15 +121,16 @@ def convert_to_asm( :param minlen_lower: If disassembling is not possible with minlen_upper, lower the minimum number of assembly functions to minlen_lower (WHAT?) :param magic_bytes: List of valid for the specific OS/type of binary, e.g. - - 'cffaedfe' for Mach-O Little Endian (64-bit) - - 'feedfacf' for Mach-O Big Endian (64-bit) - - 'cefaedfe' for Mach-O Little Endian (32-bit) + - 'cffaedfe': for Mach-O Little Endian (64-bit) + - 'feedfacf': for Mach-O Big Endian (64-bit) + - 'cefaedfe': for Mach-O Little Endian (32-bit) - 'feedface': Mach-O Big Endian (32-bit) - - 'cafebabe' Universal Binary Big Endian + - 'cafebabe': Universal Binary Big Endian + - 'bebafeca' :return: List of sha1 of disassembled malware files """ if not magic_bytes: - magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface'] + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface', 'bebafeca'] binary_dir = Path(input_path) asm_dir = Path(output_path)
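
---

For orientation, below is a minimal usage sketch of the public API touched by this patch series: convert_to_asm from asm2vec/binary_to_asm.py and compare_two from asm2vec/similarity.py. The module paths, function names and keyword arguments are taken from the diffs above; the concrete file paths, threshold values and model filename are illustrative assumptions only, not part of any commit.

    # Sketch only, based on the signatures shown in the diffs above; every
    # concrete path and numeric threshold here is a placeholder assumption.
    from asm2vec.binary_to_asm import convert_to_asm
    from asm2vec.similarity import compare_two

    # Disassemble each binary into its own folder of assembly functions,
    # retrying with the lower function-count threshold if the first pass
    # produces nothing (the behaviour fixed in the last two patches).
    disassembled = convert_to_asm(
        input_path="binaries/",     # hypothetical folder of input binaries
        output_path="asm/",         # hypothetical output folder per binary
        minlen_upper=10,
        minlen_lower=5,
    )
    print(f"Disassembled binaries: {disassembled}")

    # Compare two assembly functions using a pre-trained model. Internally
    # this calls train(..., mode="update"), the mode renamed from "test"
    # earlier in this series, and returns the cosine similarity.
    similarity = compare_two(
        data_path_1="asm/sample_a/function_0",   # hypothetical function files
        data_path_2="asm/sample_b/function_0",
        model_path="asm2vec_model.pt",           # assumed pre-trained model
        epochs=10,
        device="cpu",
    )
    print(f"Returned similarity: {similarity:.6f}")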