From 9180e1975e54af9fffa1cdb0d4124d6b52c92b61 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Mon, 2 Oct 2023 12:37:52 +0100 Subject: [PATCH 01/59] TRIVIAL - added req files and fix bug --- .idea/.gitignore | 8 ++++++++ CODEOWNERS | 32 ++++++++++++++++++++++++++++++++ Dockerfile | 13 +++++++++++++ README.md | 2 +- asm2vec/version.py | 2 ++ catalog-info.yaml | 15 +++++++++++++++ setup.py | 30 +++++++++++++++++++++++------- 7 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 CODEOWNERS create mode 100644 Dockerfile create mode 100644 asm2vec/version.py create mode 100644 catalog-info.yaml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..446aa21 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,32 @@ +# This is a comment. +# Each line is a file pattern followed by one or more owners. + +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @global-owner1 and @global-owner2 will be requested for +# review when someone opens a pull request. +* @wandera/datascience + +# Order is important; the last matching pattern takes the most +# precedence. When someone opens a pull request that only +# modifies JS files, only @js-owner and not the global +# owner(s) will be requested for a review. +# *.js @js-owner + +# You can also use email addresses if you prefer. They'll be +# used to look up users just like we do for commit author +# emails. +#*.go docs@example.com + +# The `docs/*` pattern will match files like +# `docs/getting-started.md` but not further nested files like +# `docs/build-app/troubleshooting.md`. +# docs/* docs@example.com + +# In this example, @octocat owns any file in an apps directory +# anywhere in your repository. +# apps/ @octocat + +# In this example, @doctocat owns any file in the `/docs` +# directory in the root of your repository. +# /docs/ @doctocat \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cb6efa5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.10.11-slim + +ADD . 
/asm2vec-pytorch +WORKDIR asm2vec-pytorch + +RUN apt-get update && apt-get install -y --no-install-recommends \ + unixodbc-dev \ + unixodbc \ + libpq-dev && \ + pip install -r requirements.txt && \ + python setup.py install + +CMD ["/bin/sh"] diff --git a/README.md b/README.md index 7a2043b..c5fc4ae 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B ## Requirements -python >= 3.6 +python >= 3.10 | packages | for | | --- | --- | diff --git a/asm2vec/version.py b/asm2vec/version.py new file mode 100644 index 0000000..f8e7582 --- /dev/null +++ b/asm2vec/version.py @@ -0,0 +1,2 @@ +VERSION = '1.0.0' +DEV_VERSION = '0' diff --git a/catalog-info.yaml b/catalog-info.yaml new file mode 100644 index 0000000..378ab88 --- /dev/null +++ b/catalog-info.yaml @@ -0,0 +1,15 @@ +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: asm2vec-pytorch + description: All code running ASM2VEC using PyTorch + labels: + - jira-key: DATASCI + - language: Python + annotations: + backstage.io/source-location: url:https://github.com/wandera/asm2vec-pytorch +spec: + type: service + lifecycle: production + owner: datascience + system: datascience diff --git a/setup.py b/setup.py index 62ff843..be492bc 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,30 @@ from setuptools import setup, find_packages +from asm2vec.version import VERSION + + +def readme(): + with open('README.md') as f: + return f.read() + + +def read_requirements(): + with open('requirements.txt') as f: + return [s for s in f.read().split('\n') if not ('--index-url' in s)] + + setup( name='asm2vec', - version='1.0.0', - description='Unofficial implementation of asm2vec using pytorch', - install_requires=['torch>=1.7,<2' - 'click>=7.1,<8' - 'r2pipe>=1.5,<2'], - author='oalieno', - author_email='jeffrey6910@gmail.com', + version=VERSION, + description="Jamf's implementation of asm2vec using pytorch", + long_description=readme(), + author='oalieno/jamf', + author_email='jamie.nutter@jamf.com', license='MIT License', + install_requires=read_requirements(), packages = find_packages(), + zip_safe=False, + include_package_data=True, + test_suite='nose.collector', + tests_require=['nose'] ) From 9b8decbcad77cf9230348c7c44fe0f0d43d1f640 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Mon, 2 Oct 2023 13:10:16 +0100 Subject: [PATCH 02/59] Create SECURITY.md --- SECURITY.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..c478391 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,26 @@ +Thanks for helping make GitHub safe for everyone. + +# Security + +Jamf takes the security of our software products and services seriously, including all of the open source code repositories managed through our GitHub organizations, such as asm2vec-pytorch. + +We will ensure that your finding gets passed along to the appropriate maintainers for remediation. + +# Reporting Security Issues + +If you believe you have found a security vulnerability in any Jamf-owned repository, please report it to us through coordinated disclosure. + +Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests. + +Instead, please send an email to info[@]jamf.com. 
+ +Please include as much of the information listed below as you can to help us better understand and resolve the issue: +- The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) +- Full paths of source file(s) related to the manifestation of the issue +- The location of the affected source code (tag/branch/commit or direct URL) +- Any special configuration required to reproduce the issue +- Step-by-step instructions to reproduce the issue +- Proof-of-concept or exploit code (if possible) +- Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. From 7e659f69de9b16a863e19621e2c76ea84b87e437 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Mon, 2 Oct 2023 13:40:32 +0100 Subject: [PATCH 03/59] TRIVIAL - init --- asm2vec/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 0962ef8..ae7efea 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1,6 +1 @@ -import importlib - -__all__ = ['model', 'datatype', 'utils'] - -for module in __all__: - importlib.import_module(f'.{module}', 'asm2vec') +__all__ = ["datatype", "model", "utils", "version"] From 20df9cceb0c491150764b1096fd0c44cd134745d Mon Sep 17 00:00:00 2001 From: "CI2.0" Date: Mon, 2 Oct 2023 12:43:46 +0000 Subject: [PATCH 04/59] [Jenkins] Set version to 1.0.1 --- asm2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index f8e7582..f1ae280 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,2 +1,2 @@ -VERSION = '1.0.0' +VERSION = '1.0.1' DEV_VERSION = '0' From 5be2ef8eaaaeff22a2a07e84e4821ffdbd8e71c3 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:29:39 +0200 Subject: [PATCH 05/59] AEGIS-6405 datatype PEP8 --- asm2vec/datatype.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/asm2vec/datatype.py b/asm2vec/datatype.py index a3cd39b..b6451d8 100644 --- a/asm2vec/datatype.py +++ b/asm2vec/datatype.py @@ -2,19 +2,23 @@ import random import warnings + class Token: def __init__(self, name, index): self.name = name self.index = index self.count = 1 + def __str__(self): return self.name + class Tokens: def __init__(self, name_to_index=None, tokens=None): self.name_to_index = name_to_index or {} self.tokens = tokens or [] self._weights = None + def __getitem__(self, key): if type(key) is str: if self.name_to_index.get(key) is None: @@ -28,13 +32,17 @@ def __getitem__(self, key): return [self[k] for k in key] except: raise ValueError + def load_state_dict(self, sd): self.name_to_index = sd['name_to_index'] self.tokens = sd['tokens'] + def state_dict(self): return {'name_to_index': self.name_to_index, 'tokens': self.tokens} + def size(self): return len(self.tokens) + def add(self, names): self._weights = None if type(names) is not list: @@ -46,6 +54,7 @@ def add(self, names): self.tokens.append(token) else: self.tokens[self.name_to_index[name]].count += 1 + def update(self, tokens_new): for token in tokens_new: if token.name not in self.name_to_index: @@ -54,6 +63,7 @@ def update(self, tokens_new): self.tokens.append(token) else: self.tokens[self.name_to_index[token.name]].count += token.count + def weights(self): # if no cache, calculate if self._weights is None: @@ -62,19 +72,22 @@ def weights(self): 
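             # (count / total) ** 0.75 below builds the smoothed unigram distribution
             # that sample() later feeds to torch.multinomial for negative sampling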
for token in self.tokens: self._weights[token.index] = (token.count / total) ** 0.75 return self._weights + def sample(self, batch_size, num=5): return torch.multinomial(self.weights(), num * batch_size, replacement=True).view(batch_size, num) + class Function: def __init__(self, insts, blocks, meta): self.insts = insts self.blocks = blocks self.meta = meta + @classmethod def load(cls, text): - ''' - gcc -S format compatiable - ''' + """gcc -S format compatible + """ + label, labels, insts, blocks, meta = None, {}, [], [], {} for line in text.strip('\n').split('\n'): if line[0] in [' ', '\t']: @@ -109,10 +122,13 @@ def load(cls, text): if labels.get(arg): inst.args[i] = 'CONST' return cls(insts, blocks, meta) + def tokens(self): return [token for inst in self.insts for token in inst.tokens()] + def random_walk(self, num=3): return [self._random_walk() for _ in range(num)] + def _random_walk(self): current, visited, seq = self.blocks[0], [], [] while current not in visited: @@ -124,25 +140,31 @@ def _random_walk(self): current = random.choice(list(current.successors)) return seq + class BasicBlock: def __init__(self): self.insts = [] self.successors = set() + def add(self, inst): self.insts.append(inst) + def end(self): inst = self.insts[-1] return inst.is_jmp() or inst.op == 'ret' + class Instruction: def __init__(self, op, args): self.op = op self.args = args + def __str__(self): return f'{self.op} {", ".join([str(arg) for arg in self.args if str(arg)])}' + @classmethod def load(cls, text): - text = text.strip().strip('bnd').strip() # get rid of BND prefix + text = text.strip().strip('bnd').strip() op, _, args = text.strip().partition(' ') if args: args = [arg.strip() for arg in args.split(',')] @@ -150,9 +172,12 @@ def load(cls, text): args = [] args = (args + ['', ''])[:2] return cls(op, args) + def tokens(self): return [self.op] + self.args + def is_jmp(self): return 'jmp' in self.op or self.op[0] == 'j' + def is_call(self): return self.op == 'call' From 2833c60d6c1323bf0bebba0e99ee39282dc3204a Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:30:43 +0200 Subject: [PATCH 06/59] AEGIS-6405 PEP8 model.py --- asm2vec/model.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/asm2vec/model.py b/asm2vec/model.py index 301f3be..74a6ace 100644 --- a/asm2vec/model.py +++ b/asm2vec/model.py @@ -3,35 +3,43 @@ bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1) + class ASM2VEC(nn.Module): def __init__(self, vocab_size, function_size, embedding_size): super(ASM2VEC, self).__init__() - self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size)) - self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size, _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2) - self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size, _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2) + self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size)) + self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size, + _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2) + self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size, + _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2) def update(self, function_size_new, vocab_size_new): device = 
self.embeddings.weight.device - vocab_size, function_size, embedding_size = self.embeddings.num_embeddings, self.embeddings_f.num_embeddings, self.embeddings.embedding_dim + vocab_size, function_size, embedding_size = (self.embeddings.num_embeddings, + self.embeddings_f.num_embeddings, self.embeddings.embedding_dim) if vocab_size_new != vocab_size: - weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size).to(device)]) + weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size). + to(device)]) self.embeddings = nn.Embedding(vocab_size_new, embedding_size, _weight=weight) - weight_r = torch.cat([self.embeddings_r.weight, ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2).to(device)]) + weight_r = torch.cat([self.embeddings_r.weight, + ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2) + .to(device)]) self.embeddings_r = nn.Embedding(vocab_size_new, 2 * embedding_size, _weight=weight_r) - self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size, _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5)/embedding_size/2).to(device)) + self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size, + _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5) / + embedding_size/2).to(device)) def v(self, inp): - e = self.embeddings(inp[:,1:]) - v_f = self.embeddings_f(inp[:,0]) - v_prev = torch.cat([e[:,0], (e[:,1] + e[:,2]) / 2], dim=1) - v_next = torch.cat([e[:,3], (e[:,4] + e[:,5]) / 2], dim=1) + e = self.embeddings(inp[:, 1:]) + v_f = self.embeddings_f(inp[:, 0]) + v_prev = torch.cat([e[:, 0], (e[:, 1] + e[:, 2]) / 2], dim=1) + v_next = torch.cat([e[:, 3], (e[:, 4] + e[:, 5]) / 2], dim=1) v = ((v_f + v_prev + v_next) / 3).unsqueeze(2) return v def forward(self, inp, pos, neg): device, batch_size = inp.device, inp.shape[0] v = self.v(inp) - # negative sampling loss pred = torch.bmm(self.embeddings_r(torch.cat([pos, neg], dim=1)), v).squeeze() label = torch.cat([torch.ones(batch_size, 3), torch.zeros(batch_size, neg.shape[1])], dim=1).to(device) return bce(sigmoid(pred), label) @@ -39,5 +47,6 @@ def forward(self, inp, pos, neg): def predict(self, inp, pos): device, batch_size = inp.device, inp.shape[0] v = self.v(inp) - probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).to(device)), v).squeeze(dim=2) + probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1). 
+ to(device)), v).squeeze(dim=2) return softmax(probs) From 5d0353408a7c1bf856da24c4a2021cd5a4818bd0 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 15:31:34 +0200 Subject: [PATCH 07/59] AEGIS-6405 PEP8 utils.py --- asm2vec/utils.py | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/asm2vec/utils.py b/asm2vec/utils.py index 4f9aa25..6c5b539 100644 --- a/asm2vec/utils.py +++ b/asm2vec/utils.py @@ -3,18 +3,22 @@ import torch from torch.utils.data import DataLoader, Dataset from pathlib import Path -from .datatype import Tokens, Function, Instruction -from .model import ASM2VEC +from datatype import Tokens, Function, Instruction +from model import ASM2VEC + class AsmDataset(Dataset): def __init__(self, x, y): self.x = x self.y = y + def __len__(self): return len(self.x) + def __getitem__(self, index): return self.x[index], self.y[index] + def load_data(paths, limit=None): if type(paths) is not list: paths = [paths] @@ -22,7 +26,8 @@ def load_data(paths, limit=None): filenames = [] for path in paths: if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) if os.path.isfile(Path(path) / filename)] + filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) + if os.path.isfile(Path(path) / filename)] else: filenames += [Path(path)] @@ -37,6 +42,7 @@ def load_data(paths, limit=None): return functions, tokens + def preprocess(functions, tokens): x, y = [], [] for i, fn in enumerate(functions): @@ -46,6 +52,7 @@ def preprocess(functions, tokens): y.append([tokens[token].index for token in seq[j].tokens()]) return torch.tensor(x), torch.tensor(y) + def train( functions, tokens, @@ -102,6 +109,7 @@ def train( return model + def save_model(path, model, tokens): torch.save({ 'model_params': ( @@ -113,6 +121,7 @@ def save_model(path, model, tokens): 'tokens': tokens.state_dict(), }, path) + def load_model(path, device='cpu'): checkpoint = torch.load(path, map_location=device) tokens = Tokens() @@ -122,35 +131,37 @@ def load_model(path, device='cpu'): model = model.to(device) return model, tokens + def show_probs(x, y, probs, tokens, limit=None, pretty=False): if pretty: - TL, TR, BL, BR = '┌', '┐', '└', '┘' - LM, RM, TM, BM = '├', '┤', '┬', '┴' - H, V = '─', '│' + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' arrow = ' ➔' else: - TL = TR = BL = BR = '+' - LM = RM = TM = BM = '+' - H, V = '-', '|' + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' arrow = '->' top = probs.topk(5) for i, (xi, yi) in enumerate(zip(x, y)): if limit and i >= limit: break xi, yi = xi.tolist(), yi.tolist() - print(TL + H * 42 + TR) - print(f'{V} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {V}') - print(f'{V} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {V}') - print(f'{V} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {V}') - print(LM + H * 8 + TM + H * 33 + RM) + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) for value, index in zip(top.values[i], top.indices[i]): if index in yi: colorbegin, colorclear = '\033[92m', '\033[0m' else: colorbegin, colorclear = '', '' - 
print(f'{V} {colorbegin}{value*100:05.2f}%{colorclear} {V} {colorbegin}{tokens[index.item()].name:31}{colorclear} {V}') - print(BL + H * 8 + BM + H * 33 + BR) + print(f'{v} {colorbegin}{value*100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) + def accuracy(y, probs): return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) - From 62bafc3b6bce937124c0223465ab66071253f64c Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:33:45 +0200 Subject: [PATCH 08/59] AEGIS-6405 Create binary_to_assembly.py --- asm2vec/disassembling.py | 148 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 asm2vec/disassembling.py diff --git a/asm2vec/disassembling.py b/asm2vec/disassembling.py new file mode 100644 index 0000000..87158c5 --- /dev/null +++ b/asm2vec/disassembling.py @@ -0,0 +1,148 @@ +import re +import os +import hashlib +import r2pipe +import logging +from pathlib import Path + + +class BinaryToAsm: + + def __init__(self, input_path: str, output_path: str) -> None: + """Disassembles the newly collected malware files + :param input_path: the path to the malware binaries + :param output_path: the path for the assembly functions to be extracted + """ + self.binary_dir = Path(input_path) + self.asm_dir = Path(output_path) + + @staticmethod + def _sha3(asm: str) -> str: + """Produces SHA3 for each assembly function + :param asm: input assembly function + """ + return hashlib.sha3_256(asm.encode()).hexdigest() + + @staticmethod + def _valid_exe(filename: str) -> bool: + """Extracts magic bytes and returns the header + :param filename: name of the malware file (SHA1) + :return: Boolean of the header existing in magic bytes + """ + magics = [bytes.fromhex('cffaedfe')] + with open(filename, 'rb') as f: + header = f.read(4) + return header in magics + + @staticmethod + def _normalize(opcode: str) -> str: + """ Normalizes the input string + :param opcode: opcode of the binary + """ + opcode = opcode.replace(' - ', ' + ') + opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) + opcode = re.sub(r'\*[0-9]', '*CONST', opcode) + opcode = re.sub(r' [0-9]', ' CONST', opcode) + return opcode + + def _fn_to_asm(self, pdf: dict | None, asm_minlen: int) -> str: + """Converts functions to assembly code + :param pdf: disassembly + :param asm_minlen: minimum length of assembly functions to be extracted + """ + if pdf is None: + return '' + if len(pdf['ops']) < asm_minlen: + return '' + if 'invalid' in [op['type'] for op in pdf['ops']]: + return '' + + ops = pdf['ops'] + + labels, scope = {}, [op['offset'] for op in ops] + assert (None not in scope) + for i, op in enumerate(ops): + if op.get('jump') in scope: + labels.setdefault(op.get('jump'), i) + + output = '' + for op in ops: + if labels.get(op.get('offset')) is not None: + output += f'LABEL{labels[op["offset"]]}:\n' + if labels.get(op.get('jump')) is not None: + output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' + else: + output += f' {self._normalize(op["opcode"])}\n' + + return output + + def bin_to_asm(self, filename: Path, output_path: Path, asm_minlen: int) -> int: + """Fragments the input binary into assembly functions via r2pipe + :param filename: name of the malware file (SHA1) + :param output_path: path to the folder to store the assembly functions for each malware + :param asm_minlen: the minimum length of assembly functions to be extracted + 
:return: the number of assembly functions + """ + if not self._valid_exe(filename): + logging.info('The input file is invalid.') + return 0 + + r = r2pipe.open(str(filename)) + r.cmd('aaaa') + + count = 0 + + for fn in r.cmdj('aflj'): + r.cmd(f's {fn["offset"]}') + asm = self._fn_to_asm(r.cmdj('pdfj'), asm_minlen) + if asm: + uid = self._sha3(asm) + asm = f''' .name {fn["name"]}\ + .offset {fn["offset"]:016x}\ + .file {filename.name}''' + asm + output_asm = os.path.join(output_path, uid) + with open(output_asm, 'w') as file: + file.write(asm) + count += 1 + return count + + def convert_to_asm(self, minlen_upper: int, minlen_lower: int) -> list: + """ Extracts assembly functions from malware files and saves them + into separate folder per binary + :param minlen_upper: The minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number + of assembly functions to minlen_lower + :return: List of sha1 of disassembled malware files + """ + + if not os.path.exists(self.asm_dir): + os.mkdir(self.asm_dir) + + function_count, binary_count, not_found = 0, 0, 0 + disassembled_bins = [] + + if os.path.isdir(self.binary_dir): + for entry in os.scandir(self.binary_dir): + out_dir = os.path.join(self.asm_dir, entry.name) + if not (os.path.exists(out_dir)): + os.mkdir(out_dir) + function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_upper) + if function_count == 0: + function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_lower) + if function_count == 0: + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + not_found += 1 + logging.info("[Error] No such file or directory: {}".format(self.binary_dir)) + + logging.info("Total scanned binaries: {}".format(binary_count)) + logging.info("Not converted binaries: {}".format(not_found)) + + return disassembled_bins From 5dea44335b0ca4ae9b1359fee005806e385e0500 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:35:25 +0200 Subject: [PATCH 09/59] AEGIS-6405 Delete scripts/bin2asm.py scripts/bin2asm.py to be replaced with asm2vec/binary_to_asm.py --- scripts/bin2asm.py | 117 --------------------------------------------- 1 file changed, 117 deletions(-) delete mode 100644 scripts/bin2asm.py diff --git a/scripts/bin2asm.py b/scripts/bin2asm.py deleted file mode 100644 index 2134e8c..0000000 --- a/scripts/bin2asm.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -import re -import os -import click -import r2pipe -import hashlib -from pathlib import Path - -def sha3(data): - return hashlib.sha3_256(data.encode()).hexdigest() - -def validEXE(filename): - magics = [bytes.fromhex('7f454c46')] - with open(filename, 'rb') as f: - header = f.read(4) - return header in magics - -def normalize(opcode): - opcode = opcode.replace(' - ', ' + ') - opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) - opcode = re.sub(r'\*[0-9]', '*CONST', opcode) - opcode = re.sub(r' [0-9]', ' CONST', opcode) - return opcode - -def fn2asm(pdf, minlen): - # check - if pdf is None: - return - if len(pdf['ops']) < minlen: - return - if 'invalid' in [op['type'] for op in pdf['ops']]: - return - - ops = pdf['ops'] - - # set label - labels, scope = {}, [op['offset'] for op in ops] - assert(None not in scope) - for i, op in 
enumerate(ops): - if op.get('jump') in scope: - labels.setdefault(op.get('jump'), i) - - # dump output - output = '' - for op in ops: - # add label - if labels.get(op.get('offset')) is not None: - output += f'LABEL{labels[op["offset"]]}:\n' - # add instruction - if labels.get(op.get('jump')) is not None: - output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' - else: - output += f' {normalize(op["opcode"])}\n' - - return output - -def bin2asm(filename, opath, minlen): - # check - if not validEXE(filename): - return 0 - - r = r2pipe.open(str(filename)) - r.cmd('aaaa') - - count = 0 - - for fn in r.cmdj('aflj'): - r.cmd(f's {fn["offset"]}') - asm = fn2asm(r.cmdj('pdfj'), minlen) - if asm: - uid = sha3(asm) - asm = f''' .name {fn["name"]} - .offset {fn["offset"]:016x} - .file {filename.name} -''' + asm - with open(opath / uid, 'w') as f: - f.write(asm) - count += 1 - - print(f'[+] {filename}') - - return count - -@click.command() -@click.option('-i', '--input', 'ipath', help='input directory / file', required=True) -@click.option('-o', '--output', 'opath', default='asm', help='output directory') -@click.option('-l', '--len', 'minlen', default=10, help='ignore assembly code with instructions amount smaller than minlen') -def cli(ipath, opath, minlen): - ''' - Extract assembly functions from binary executable - ''' - ipath = Path(ipath) - opath = Path(opath) - - # create output directory - if not os.path.exists(opath): - os.mkdir(opath) - - fcount, bcount = 0, 0 - - # directory - if os.path.isdir(ipath): - for f in os.listdir(ipath): - if not os.path.islink(ipath / f) and not os.path.isdir(ipath / f): - fcount += bin2asm(ipath / f, opath, minlen) - bcount += 1 - # file - elif os.path.exists(ipath): - fcount += bin2asm(ipath, opath, minlen) - bcount += 1 - else: - print(f'[Error] No such file or directory: {ipath}') - - print(f'[+] Total scan binary: {bcount} => Total generated assembly functions: {fcount}') - -if __name__ == '__main__': - cli() From 988f430156afadb153324b6e5ed41ea23e02d7ff Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:36:53 +0200 Subject: [PATCH 10/59] AEGIS-6405 Rename disassembling.py to binary_to_asm.py --- asm2vec/{disassembling.py => binary_to_asm.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename asm2vec/{disassembling.py => binary_to_asm.py} (100%) diff --git a/asm2vec/disassembling.py b/asm2vec/binary_to_asm.py similarity index 100% rename from asm2vec/disassembling.py rename to asm2vec/binary_to_asm.py From ec58db1cabddc36f6fc193a5ae01fb2007ec067b Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:01:09 +0200 Subject: [PATCH 11/59] AEGIS-6405 Update __init__.py --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index ae7efea..d3afa2c 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "version"] +__all__ = ["datatype", "model", "utils", "binary_to_asm", "version"] From 18cd90c59df8faddca0ca537ee51e98eeb812a15 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:14:29 +0200 Subject: [PATCH 12/59] AEGIS-6405 Update asm2vec/utils.py - JN review Co-authored-by: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> --- asm2vec/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/asm2vec/utils.py b/asm2vec/utils.py index 6c5b539..b233d33 100644 --- a/asm2vec/utils.py +++ b/asm2vec/utils.py @@ -3,8 +3,8 @@ import torch from torch.utils.data import DataLoader, Dataset from pathlib import Path -from datatype import Tokens, Function, Instruction -from model import ASM2VEC +from asm2vec.datatype import Tokens, Function, Instruction +from asm2vec.model import ASM2VEC class AsmDataset(Dataset): From 6632b198c84f36ed8cde12b53428ad058834bf9f Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 17:26:56 +0200 Subject: [PATCH 13/59] AEGIS-6405 remove class --- asm2vec/binary_to_asm.py | 265 +++++++++++++++++++-------------------- 1 file changed, 131 insertions(+), 134 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 87158c5..fe30d9a 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -6,143 +6,140 @@ from pathlib import Path -class BinaryToAsm: - - def __init__(self, input_path: str, output_path: str) -> None: - """Disassembles the newly collected malware files - :param input_path: the path to the malware binaries - :param output_path: the path for the assembly functions to be extracted - """ - self.binary_dir = Path(input_path) - self.asm_dir = Path(output_path) - - @staticmethod - def _sha3(asm: str) -> str: - """Produces SHA3 for each assembly function - :param asm: input assembly function - """ - return hashlib.sha3_256(asm.encode()).hexdigest() - - @staticmethod - def _valid_exe(filename: str) -> bool: - """Extracts magic bytes and returns the header - :param filename: name of the malware file (SHA1) - :return: Boolean of the header existing in magic bytes - """ - magics = [bytes.fromhex('cffaedfe')] - with open(filename, 'rb') as f: - header = f.read(4) - return header in magics - - @staticmethod - def _normalize(opcode: str) -> str: - """ Normalizes the input string - :param opcode: opcode of the binary - """ - opcode = opcode.replace(' - ', ' + ') - opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) - opcode = re.sub(r'\*[0-9]', '*CONST', opcode) - opcode = re.sub(r' [0-9]', ' CONST', opcode) - return opcode - - def _fn_to_asm(self, pdf: dict | None, asm_minlen: int) -> str: - """Converts functions to assembly code - :param pdf: disassembly - :param asm_minlen: minimum length of assembly functions to be extracted - """ - if pdf is None: - return '' - if len(pdf['ops']) < asm_minlen: - return '' - if 'invalid' in [op['type'] for op in pdf['ops']]: - return '' - - ops = pdf['ops'] - - labels, scope = {}, [op['offset'] for op in ops] - assert (None not in scope) - for i, op in enumerate(ops): - if op.get('jump') in scope: - labels.setdefault(op.get('jump'), i) - - output = '' - for op in ops: - if labels.get(op.get('offset')) is not None: - output += f'LABEL{labels[op["offset"]]}:\n' - if labels.get(op.get('jump')) is not None: - output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' - else: - output += f' {self._normalize(op["opcode"])}\n' - - return output - - def bin_to_asm(self, filename: Path, output_path: Path, asm_minlen: int) -> int: - """Fragments the input binary into assembly functions via r2pipe - :param filename: name of the malware file (SHA1) - :param output_path: path to the folder to store the assembly functions for each malware - :param asm_minlen: the minimum length of assembly functions to be extracted - :return: the number of assembly functions - """ - if not self._valid_exe(filename): - logging.info('The input file is invalid.') - 
return 0 - - r = r2pipe.open(str(filename)) - r.cmd('aaaa') - - count = 0 - - for fn in r.cmdj('aflj'): - r.cmd(f's {fn["offset"]}') - asm = self._fn_to_asm(r.cmdj('pdfj'), asm_minlen) - if asm: - uid = self._sha3(asm) - asm = f''' .name {fn["name"]}\ - .offset {fn["offset"]:016x}\ - .file {filename.name}''' + asm - output_asm = os.path.join(output_path, uid) - with open(output_asm, 'w') as file: - file.write(asm) - count += 1 - return count - - def convert_to_asm(self, minlen_upper: int, minlen_lower: int) -> list: - """ Extracts assembly functions from malware files and saves them - into separate folder per binary - :param minlen_upper: The minimum number of assembly functions needed for disassembling - :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number - of assembly functions to minlen_lower - :return: List of sha1 of disassembled malware files - """ - - if not os.path.exists(self.asm_dir): - os.mkdir(self.asm_dir) - - function_count, binary_count, not_found = 0, 0, 0 - disassembled_bins = [] - - if os.path.isdir(self.binary_dir): - for entry in os.scandir(self.binary_dir): - out_dir = os.path.join(self.asm_dir, entry.name) - if not (os.path.exists(out_dir)): - os.mkdir(out_dir) - function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_upper) +def _sha3(asm: str) -> str: + """Produces SHA3 for each assembly function + :param asm: input assembly function + """ + return hashlib.sha3_256(asm.encode()).hexdigest() + + +def _valid_exe(filename: str) -> bool: + """Extracts magic bytes and returns the header + :param filename: name of the malware file (SHA1) + :return: Boolean of the header existing in magic bytes + """ + magics = [bytes.fromhex('cffaedfe')] + with open(filename, 'rb') as f: + header = f.read(4) + return header in magics + + +def _normalize(opcode: str) -> str: + """ Normalizes the input string + :param opcode: opcode of the binary + """ + opcode = opcode.replace(' - ', ' + ') + opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) + opcode = re.sub(r'\*[0-9]', '*CONST', opcode) + opcode = re.sub(r' [0-9]', ' CONST', opcode) + return opcode + + +def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: + """Converts functions to assembly code + :param pdf: disassembly + :param asm_minlen: minimum length of assembly functions to be extracted + """ + if pdf is None: + return '' + if len(pdf['ops']) < asm_minlen: + return '' + if 'invalid' in [op['type'] for op in pdf['ops']]: + return '' + + ops = pdf['ops'] + + labels, scope = {}, [op['offset'] for op in ops] + assert (None not in scope) + for i, op in enumerate(ops): + if op.get('jump') in scope: + labels.setdefault(op.get('jump'), i) + + output = '' + for op in ops: + if labels.get(op.get('offset')) is not None: + output += f'LABEL{labels[op["offset"]]}:\n' + if labels.get(op.get('jump')) is not None: + output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' + else: + output += f' {_normalize(op["opcode"])}\n' + + return output + + +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int) -> int: + """Fragments the input binary into assembly functions via r2pipe + :param filename: name of the malware file (SHA1) + :param output_path: path to the folder to store the assembly functions for each malware + :param asm_minlen: the minimum length of assembly functions to be extracted + :return: the number of assembly functions + """ + if not _valid_exe(filename): + logging.info('The input file is invalid.') + return 0 + + r = r2pipe.open(str(filename)) + r.cmd('aaaa') 
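+    # r2pipe commands used here: 'aaaa' runs radare2's most thorough
+    # auto-analysis pass, 'aflj' lists the recovered functions as JSON,
+    # and for each one 's <offset>' seeks to it while 'pdfj' returns its
+    # disassembly as JSON for _fn_to_asm to filter and normalize.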
+ + count = 0 + + for fn in r.cmdj('aflj'): + r.cmd(f's {fn["offset"]}') + asm = _fn_to_asm(r.cmdj('pdfj'), asm_minlen) + if asm: + uid = _sha3(asm) + asm = f''' .name {fn["name"]}\ + .offset {fn["offset"]:016x}\ + .file {filename.name}''' + asm + output_asm = os.path.join(output_path, uid) + with open(output_asm, 'w') as file: + file.write(asm) + count += 1 + return count + + +def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int) -> list: + """ Extracts assembly functions from malware files and saves them + into separate folder per binary + :param input_path: the path to the malware binaries + :param output_path: the path for the assembly functions to be extracted + :param minlen_upper: The minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number + of assembly functions to minlen_lower + :return: List of sha1 of disassembled malware files + """ + + binary_dir = Path(input_path) + asm_dir = Path(output_path) + + if not os.path.exists(asm_dir): + os.mkdir(asm_dir) + + function_count, binary_count, not_found = 0, 0, 0 + disassembled_bins = [] + + if os.path.isdir(binary_dir): + for entry in os.scandir(binary_dir): + out_dir = os.path.join(asm_dir, entry.name) + if not (os.path.exists(out_dir)): + os.mkdir(out_dir) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper) + if function_count == 0: + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower) if function_count == 0: - function_count += self.bin_to_asm(Path(entry), Path(out_dir), minlen_lower) - if function_count == 0: - os.rmdir(out_dir) - logging.info('The binary {} was not disassembled'.format(entry.name)) - else: - binary_count += 1 - disassembled_bins.append(entry.name) + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) else: binary_count += 1 disassembled_bins.append(entry.name) - else: - not_found += 1 - logging.info("[Error] No such file or directory: {}".format(self.binary_dir)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + not_found += 1 + logging.info("[Error] No such file or directory: {}".format(binary_dir)) - logging.info("Total scanned binaries: {}".format(binary_count)) - logging.info("Not converted binaries: {}".format(not_found)) + logging.info("Total scanned binaries: {}".format(binary_count)) + logging.info("Not converted binaries: {}".format(not_found)) - return disassembled_bins + return disassembled_bins From 98f9868be39f6e03bb300d2a44edbd67231e17d9 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 23:57:00 +0200 Subject: [PATCH 14/59] AEGIS-6406 Create train.py Script for training an asm2vec model --- asm2vec/train.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ scripts/train.py | 52 -------------------------------- 2 files changed, 77 insertions(+), 52 deletions(-) create mode 100644 asm2vec/train.py delete mode 100644 scripts/train.py diff --git a/asm2vec/train.py b/asm2vec/train.py new file mode 100644 index 0000000..32c11c0 --- /dev/null +++ b/asm2vec/train.py @@ -0,0 +1,77 @@ +import torch +import asm2vec +import logging +from pathlib import Path +from asm2vec import utils + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def callback(context) -> None: + """Prettifies the display of accuracy, if chosen + """ + progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ + 
loss = {context["loss"]:.4f}' + + if context["accuracy"]: + progress += f', accuracy = {context["accuracy"]:.4f}' + logging.info(f"{progress}") + + +def train_asm2vec_model( + train_set: str, + new_model: str, + model_path: str | None, + limit: int, + epochs: int, + calc_acc: False, + embedding_size=100, + batch_size=1024, + neg_sample=25, + lr=0.02, + device='cpu', +) -> None: + """Trains an asm2vec model + :param train_set: path to the training dataset + :param new_model: path to the model to be trained + :param model_path: path to already trained model + :param limit: number of the assembly functions that the model will be trained on; + if not defined, all the assembly functions in train_set_path + :param epochs: number of epochs + :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training + :param embedding_size: size of the vector representation for a token; an assembly function + will be represented with a vector twice that size + :param batch_size: the size of batches for training + :param neg_sample: negative sampling amount + :param device: 'auto' | 'cuda' | 'cpu' + :param lr: learning rate + """ + + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + if model_path: + model, tokens = asm2vec.utils.load_model(model_path, device=device) + functions, tokens_new = asm2vec.utils.load_data(train_set, limit=limit) + tokens.update(tokens_new) + model.update(len(functions), tokens.size()) + else: + model = None + functions, tokens = asm2vec.utils.load_data(Path(train_set), limit=limit) + + model = asm2vec.utils.train( + functions, + tokens, + model=model, + embedding_size=embedding_size, + batch_size=batch_size, + epochs=epochs, + neg_sample_num=neg_sample, + calc_acc=calc_acc, + device=device, + callback=callback, + learning_rate=lr + ) + asm2vec.utils.save_model(new_model, model, tokens) + + return None diff --git a/scripts/train.py b/scripts/train.py deleted file mode 100644 index 98391f4..0000000 --- a/scripts/train.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -import click -import asm2vec - -@click.command() -@click.option('-i', '--input', 'ipath', help='training data folder', required=True) -@click.option('-o', '--output', 'opath', default='model.pt', help='output model path', show_default=True) -@click.option('-m', '--model', 'mpath', help='load previous trained model path', type=str) -@click.option('-l', '--limit', help='limit the number of functions to be loaded', show_default=True, type=int) -@click.option('-d', '--ebedding-dimension', 'embedding_size', default=100, help='embedding dimension', show_default=True) -@click.option('-b', '--batch-size', 'batch_size', default=1024, help='batch size', show_default=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True) -@click.option('-a', '--calculate-accuracy', 'calc_acc', help='whether calculate accuracy ( will be significantly slower )', is_flag=True) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -def cli(ipath, opath, mpath, limit, embedding_size, batch_size, epochs, neg_sample_num, calc_acc, device, lr): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - if mpath: - model, tokens = 
asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data(ipath, limit=limit) - tokens.update(tokens_new) - model.update(len(functions), tokens.size()) - else: - model = None - functions, tokens = asm2vec.utils.load_data(ipath, limit=limit) - - def callback(context): - progress = f'{context["epoch"]} | time = {context["time"]:.2f}, loss = {context["loss"]:.4f}' - if context["accuracy"]: - progress += f', accuracy = {context["accuracy"]:.4f}' - print(progress) - asm2vec.utils.save_model(opath, context["model"], context["tokens"]) - - model = asm2vec.utils.train( - functions, - tokens, - model=model, - embedding_size=embedding_size, - batch_size=batch_size, - epochs=epochs, - neg_sample_num=neg_sample_num, - calc_acc=calc_acc, - device=device, - callback=callback, - learning_rate=lr - ) - -if __name__ == '__main__': - cli() From 74be99e44cebdd6f03cec67a56d8eef7e002b4a3 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 2 Oct 2023 23:57:45 +0200 Subject: [PATCH 15/59] AEGIS-6406 Update __init__.py --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index d3afa2c..f6e961b 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "binary_to_asm", "version"] +__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "version"] From 2755c99e3e3d69d1d73bd9f9ec847a6a4686f197 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 00:20:56 +0200 Subject: [PATCH 16/59] AEGIS-6406 Update __init__.py --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index f6e961b..291f06f 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "version"] +__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "tensors", "version"] From 26d492c2e81d4b19e34a642fa096dc0c5053df75 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 00:21:54 +0200 Subject: [PATCH 17/59] AEGIS-6406 Create tensors.py Script for calculation of tensor representations --- asm2vec/tensors.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 asm2vec/tensors.py diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py new file mode 100644 index 0000000..0d419c0 --- /dev/null +++ b/asm2vec/tensors.py @@ -0,0 +1,69 @@ +import os +import torch +import logging +import asm2vec +from asm2vec import utils +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.02) -> list: + """Calculates vector representation of a binary as the mean per column + of the vector representations of its assembly functions + :param asm_path: folder with assembly function in a subfolder per binary + :param tensor_path: folder to store the tensors + :param model_path: path to the trained model + :param epochs: number of epochs + :param device: 'auto' | 'cuda' | 'cpu' + :param lr: learning rate + """ + tensors_list = [] + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + if os.path.isfile(model_path): + model, tokens = 
asm2vec.utils.load_model(model_path, device=device) + else: + print("No valid model") + return [] + + dir0 = Path(tensor_path) + if not (os.path.exists(dir0)): + os.mkdir(dir0) + + if os.path.isdir(asm_path): + obj = os.scandir(asm_path) + for entry in obj: + if entry.is_dir() and os.listdir(entry) and entry.name: + tensor_file = os.path.join(dir0, entry.name) + if not (os.path.exists(tensor_file)): + functions, tokens_new = asm2vec.utils.load_data([entry]) + file_count = sum(len(files) for _, _, files in os.walk(entry)) + tokens.update(tokens_new) + logging.info(f"Binary {entry.name}: {file_count} assembly functions") + model.update(file_count, tokens.size()) + model = model.to(device) + + model = asm2vec.utils.train( + functions, + tokens, + model=model, + epochs=epochs, + device=device, + mode='test', + learning_rate=lr + ) + + tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))])) + tens = torch.squeeze(tensor) + if file_count == 1: + torch.save(tensor, tensor_file) + else: + torch.save(tens.mean(0), tensor_file) + tensors_list.append(entry.name) + + else: + logging.info("No valid directory") + + return tensors_list From 36d29bb74f6931f80f3d1a9da547bb746446bc9a Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 09:45:56 +0200 Subject: [PATCH 18/59] AEGIS-6405 pass magic bytes as variable Magic bytes as variable so that it is usable for other OS/file formats --- asm2vec/binary_to_asm.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index fe30d9a..1480459 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -13,12 +13,13 @@ def _sha3(asm: str) -> str: return hashlib.sha3_256(asm.encode()).hexdigest() -def _valid_exe(filename: str) -> bool: +def _valid_exe(filename: str, magic_bytes) -> bool: """Extracts magic bytes and returns the header :param filename: name of the malware file (SHA1) + :param magic_bytes for the specific OS/type of binary :return: Boolean of the header existing in magic bytes """ - magics = [bytes.fromhex('cffaedfe')] + magics = [bytes.fromhex(magic_bytes)] with open(filename, 'rb') as f: header = f.read(4) return header in magics @@ -67,14 +68,15 @@ def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: return output -def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int) -> int: +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes) -> int: """Fragments the input binary into assembly functions via r2pipe :param filename: name of the malware file (SHA1) :param output_path: path to the folder to store the assembly functions for each malware :param asm_minlen: the minimum length of assembly functions to be extracted + :param magic_bytes for the specific OS/type of binary :return: the number of assembly functions """ - if not _valid_exe(filename): + if not _valid_exe(filename, magic_bytes): logging.info('The input file is invalid.') return 0 @@ -98,7 +100,7 @@ def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int) -> int: return count -def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int) -> list: +def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int, magic_bytes='cffaedfe') -> list: """ Extracts assembly functions from malware files and saves them into separate folder per binary :param input_path: the path to the malware binaries @@ -106,6 +108,7 @@ def 
convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int :param minlen_upper: The minimum number of assembly functions needed for disassembling :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number of assembly functions to minlen_lower + :param magic_bytes for the specific OS/type of binary :return: List of sha1 of disassembled malware files """ @@ -123,9 +126,9 @@ def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int out_dir = os.path.join(asm_dir, entry.name) if not (os.path.exists(out_dir)): os.mkdir(out_dir) - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) if function_count == 0: - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) if function_count == 0: os.rmdir(out_dir) logging.info('The binary {} was not disassembled'.format(entry.name)) From f47abd0befe33b62871b706b7e186cf19dbc8307 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:30:56 +0200 Subject: [PATCH 19/59] AEGIS-6405 fixing logging --- asm2vec/binary_to_asm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 1480459..a77b5a7 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -5,6 +5,8 @@ import logging from pathlib import Path +logging.basicConfig(level=logging.INFO, format='%(message)s') + def _sha3(asm: str) -> str: """Produces SHA3 for each assembly function From 9c3cf83023dcab680bc7cbf7006768be3157350f Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:34:47 +0200 Subject: [PATCH 20/59] AEGIS-6406 Update - JN review Co-authored-by: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> --- asm2vec/tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 0d419c0..33027a0 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -23,7 +23,7 @@ def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.0 device = 'cuda' if torch.cuda.is_available() else 'cpu' if os.path.isfile(model_path): - model, tokens = asm2vec.utils.load_model(model_path, device=device) + model, tokens = utils.load_model(model_path, device=device) else: print("No valid model") return [] From 9d0ea0fb32a8d86da347f0218f9a20cbcac36aee Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:47:20 +0200 Subject: [PATCH 21/59] AEGIS-6406 fix package import, args types --- asm2vec/tensors.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 33027a0..4319cd2 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -1,14 +1,18 @@ import os import torch import logging -import asm2vec from asm2vec import utils from pathlib import Path logging.basicConfig(level=logging.INFO, format='%(message)s') -def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.02) -> list: +def calc_tensors(asm_path: str, + tensor_path: str, + model_path: str, + epochs: int, + device: str = 'cpu', + learning_rate: float = 0.02) -> list: """Calculates vector representation of a binary as the mean per column 
of the vector representations of its assembly functions :param asm_path: folder with assembly function in a subfolder per binary @@ -16,7 +20,7 @@ def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.0 :param model_path: path to the trained model :param epochs: number of epochs :param device: 'auto' | 'cuda' | 'cpu' - :param lr: learning rate + :param learning_rate: learning rate """ tensors_list = [] if device == 'auto': @@ -38,14 +42,14 @@ def calc_tensors(asm_path, tensor_path, model_path, epochs, device='cpu', lr=0.0 if entry.is_dir() and os.listdir(entry) and entry.name: tensor_file = os.path.join(dir0, entry.name) if not (os.path.exists(tensor_file)): - functions, tokens_new = asm2vec.utils.load_data([entry]) + functions, tokens_new = utils.load_data([entry]) file_count = sum(len(files) for _, _, files in os.walk(entry)) tokens.update(tokens_new) logging.info(f"Binary {entry.name}: {file_count} assembly functions") model.update(file_count, tokens.size()) model = model.to(device) - model = asm2vec.utils.train( + model = utils.train( functions, tokens, model=model, From 045ea32f152c07dcdbecbe83c3b66551a4fc07bc Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:57:39 +0200 Subject: [PATCH 22/59] AEGIS-6406 args types, function return --- asm2vec/train.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index 32c11c0..6e6eb3c 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -3,6 +3,7 @@ import logging from pathlib import Path from asm2vec import utils +from asm2vec.model import ASM2VEC logging.basicConfig(level=logging.INFO, format='%(message)s') @@ -24,13 +25,13 @@ def train_asm2vec_model( model_path: str | None, limit: int, epochs: int, - calc_acc: False, - embedding_size=100, - batch_size=1024, - neg_sample=25, - lr=0.02, - device='cpu', -) -> None: + calc_acc: bool = False, + embedding_size: int = 100, + batch_size: int = 1024, + neg_sample: int = 25, + learning_rate: float = 0.02, + device: str = 'cpu' +) -> ASM2VEC: """Trains an asm2vec model :param train_set: path to the training dataset :param new_model: path to the model to be trained @@ -44,22 +45,22 @@ def train_asm2vec_model( :param batch_size: the size of batches for training :param neg_sample: negative sampling amount :param device: 'auto' | 'cuda' | 'cpu' - :param lr: learning rate + :param learning_rate: learning rate """ if device == 'auto': device = 'cuda' if torch.cuda.is_available() else 'cpu' if model_path: - model, tokens = asm2vec.utils.load_model(model_path, device=device) - functions, tokens_new = asm2vec.utils.load_data(train_set, limit=limit) + model, tokens = utils.load_model(model_path, device=device) + functions, tokens_new = utils.load_data(train_set, limit=limit) tokens.update(tokens_new) model.update(len(functions), tokens.size()) else: model = None - functions, tokens = asm2vec.utils.load_data(Path(train_set), limit=limit) + functions, tokens = utils.load_data(Path(train_set), limit=limit) - model = asm2vec.utils.train( + model = utils.train( functions, tokens, model=model, @@ -70,8 +71,8 @@ def train_asm2vec_model( calc_acc=calc_acc, device=device, callback=callback, - learning_rate=lr + learning_rate=learning_rate ) - asm2vec.utils.save_model(new_model, model, tokens) + utils.save_model(new_model, model, tokens) - return None + return model From 70acd348bdf3ef932ad3580554ba773f1e57fbcf Mon Sep 17 00:00:00 2001 From: 
ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 10:59:14 +0200 Subject: [PATCH 23/59] AEGIS-6406 remove import --- asm2vec/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index 6e6eb3c..f161891 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -1,5 +1,4 @@ import torch -import asm2vec import logging from pathlib import Path from asm2vec import utils From 8453b40afc84fadec853af6ce4045cf8e074a178 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:33:28 +0200 Subject: [PATCH 24/59] AEGIS-6405 magic bytes as list of strings If none, then use the magic bytes for MacOS --- asm2vec/binary_to_asm.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index a77b5a7..bfcc33b 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -15,13 +15,13 @@ def _sha3(asm: str) -> str: return hashlib.sha3_256(asm.encode()).hexdigest() -def _valid_exe(filename: str, magic_bytes) -> bool: +def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: """Extracts magic bytes and returns the header :param filename: name of the malware file (SHA1) :param magic_bytes for the specific OS/type of binary :return: Boolean of the header existing in magic bytes """ - magics = [bytes.fromhex(magic_bytes)] + magics = [bytes.fromhex(i) for i in magic_bytes] with open(filename, 'rb') as f: header = f.read(4) return header in magics @@ -70,7 +70,7 @@ def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: return output -def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes) -> int: +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int: """Fragments the input binary into assembly functions via r2pipe :param filename: name of the malware file (SHA1) :param output_path: path to the folder to store the assembly functions for each malware @@ -102,7 +102,12 @@ def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes) return count -def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int, magic_bytes='cffaedfe') -> list: +def convert_to_asm(input_path: str, + output_path: str, + minlen_upper: int, + minlen_lower: int, + magic_bytes: list[str] = None + ) -> list: """ Extracts assembly functions from malware files and saves them into separate folder per binary :param input_path: the path to the malware binaries @@ -110,9 +115,14 @@ def convert_to_asm(input_path, output_path, minlen_upper: int, minlen_lower: int :param minlen_upper: The minimum number of assembly functions needed for disassembling :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number of assembly functions to minlen_lower - :param magic_bytes for the specific OS/type of binary + :param magic_bytes: list of valid for the specific OS/type of binary; e.g. 
+ 'cffaedfe' for Mach-O Little Endian (64-bit) + 'feedfacf' for Mach-O Big Endian (64-bit) + 'cafebabe' Universal Binary Big Endian :return: List of sha1 of disassembled malware files """ + if not magic_bytes: + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe'] binary_dir = Path(input_path) asm_dir = Path(output_path) From 36eda60399d83cf6b9055e351433746380eb1e9b Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:39:48 +0200 Subject: [PATCH 25/59] AEGIS-6405 add more magic bytes for MacOS --- asm2vec/binary_to_asm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index bfcc33b..3c141d9 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -118,11 +118,13 @@ def convert_to_asm(input_path: str, :param magic_bytes: list of valid for the specific OS/type of binary; e.g. 'cffaedfe' for Mach-O Little Endian (64-bit) 'feedfacf' for Mach-O Big Endian (64-bit) + 'cefaedfe' for Mach-O Little Endian (32-bit) + 'feedface': Mach-O Big Endian (32-bit) 'cafebabe' Universal Binary Big Endian :return: List of sha1 of disassembled malware files """ if not magic_bytes: - magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe'] + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface'] binary_dir = Path(input_path) asm_dir = Path(output_path) From 96a8a00ad3cffd823b4c85c8ffae1dd48ef68452 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:26:35 +0200 Subject: [PATCH 26/59] AEGIS-6406 migrate utils.py to train.py --- asm2vec/train.py | 178 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 171 insertions(+), 7 deletions(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index f161891..12b8fe5 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -1,12 +1,175 @@ +import os +import time import torch import logging from pathlib import Path -from asm2vec import utils +from torch.utils.data import DataLoader, Dataset from asm2vec.model import ASM2VEC +from asm2vec.datatype import Tokens, Function, Instruction logging.basicConfig(level=logging.INFO, format='%(message)s') +class AsmDataset(Dataset): + def __init__(self, x, y): + self.x = x + self.y = y + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + return self.x[index], self.y[index] + + +def load_data(paths, limit=None): + if type(paths) is not list: + paths = [paths] + + filenames = [] + for path in paths: + if os.path.isdir(path): + filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) + if os.path.isfile(Path(path) / filename)] + else: + filenames += [Path(path)] + + functions, tokens = [], Tokens() + for i, filename in enumerate(filenames): + if limit and i >= limit: + break + with open(filename) as f: + fn = Function.load(f.read()) + functions.append(fn) + tokens.add(fn.tokens()) + + return functions, tokens + + +def preprocess(functions, tokens): + x, y = [], [] + for i, fn in enumerate(functions): + for seq in fn.random_walk(): + for j in range(1, len(seq) - 1): + x.append([i] + [tokens[token].index for token in seq[j - 1].tokens() + seq[j + 1].tokens()]) + y.append([tokens[token].index for token in seq[j].tokens()]) + return torch.tensor(x), torch.tensor(y) + + +def train( + functions, + tokens, + model=None, + embedding_size=100, + batch_size=1024, + epochs=10, + neg_sample_num=25, + calc_acc=False, + device='cpu', + mode='train', + callback=None, 
+ learning_rate=0.02 +): + if mode == 'train': + if model is None: + model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + elif mode == 'test': + if model is None: + raise ValueError("test mode required pretrained model") + optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) + else: + raise ValueError("Unknown mode") + + loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True) + for epoch in range(epochs): + start = time.time() + loss_sum, loss_count, accs = 0.0, 0, [] + + model.train() + for i, (inp, pos) in enumerate(loader): + neg = tokens.sample(inp.shape[0], neg_sample_num) + loss = model(inp.to(device), pos.to(device), neg.to(device)) + loss_sum, loss_count = loss_sum + loss, loss_count + 1 + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if i == 0 and calc_acc: + probs = model.predict(inp.to(device), pos.to(device)) + accs.append(accuracy(pos, probs)) + + if callback: + callback({ + 'model': model, + 'tokens': tokens, + 'epoch': epoch, + 'time': time.time() - start, + 'loss': loss_sum / loss_count, + 'accuracy': torch.tensor(accs).mean() if calc_acc else None + }) + + return model + + +def save_model(path, model, tokens): + torch.save({ + 'model_params': ( + model.embeddings.num_embeddings, + model.embeddings_f.num_embeddings, + model.embeddings.embedding_dim + ), + 'model': model.state_dict(), + 'tokens': tokens.state_dict(), + }, path) + + +def load_model(path, device='cpu'): + checkpoint = torch.load(path, map_location=device) + tokens = Tokens() + tokens.load_state_dict(checkpoint['tokens']) + model = ASM2VEC(*checkpoint['model_params']) + model.load_state_dict(checkpoint['model']) + model = model.to(device) + return model, tokens + + +def show_probs(x, y, probs, tokens, limit=None, pretty=False): + if pretty: + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' + arrow = ' ➔' + else: + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' + arrow = '->' + top = probs.topk(5) + for i, (xi, yi) in enumerate(zip(x, y)): + if limit and i >= limit: + break + xi, yi = xi.tolist(), yi.tolist() + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) + for value, index in zip(top.values[i], top.indices[i]): + if index in yi: + colorbegin, colorclear = '\033[92m', '\033[0m' + else: + colorbegin, colorclear = '', '' + print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) + + +def accuracy(y, probs): + return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) + + def callback(context) -> None: """Prettifies the display of accuracy, if chosen """ @@ -22,8 +185,8 @@ def train_asm2vec_model( train_set: str, new_model: str, model_path: str | None, - limit: int, epochs: int, + limit: int = None, calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, @@ -31,6 +194,7 @@ def train_asm2vec_model( learning_rate: float = 0.02, device: str = 'cpu' ) -> ASM2VEC: + """Trains an asm2vec model :param train_set: path to the 
training dataset :param new_model: path to the model to be trained @@ -51,15 +215,15 @@ def train_asm2vec_model( device = 'cuda' if torch.cuda.is_available() else 'cpu' if model_path: - model, tokens = utils.load_model(model_path, device=device) - functions, tokens_new = utils.load_data(train_set, limit=limit) + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data(train_set, limit=limit) tokens.update(tokens_new) model.update(len(functions), tokens.size()) else: model = None - functions, tokens = utils.load_data(Path(train_set), limit=limit) + functions, tokens = load_data(Path(train_set), limit=limit) - model = utils.train( + model = train( functions, tokens, model=model, @@ -72,6 +236,6 @@ def train_asm2vec_model( callback=callback, learning_rate=learning_rate ) - utils.save_model(new_model, model, tokens) + save_model(new_model, model, tokens) return model From a3bc3d08735cf4c7957b6ab856a3f40f92b3347a Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:27:15 +0200 Subject: [PATCH 27/59] AEGIS-6406 remove utils --- asm2vec/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 291f06f..2f3c046 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1 @@ -__all__ = ["datatype", "model", "utils", "binary_to_asm", "train", "tensors", "version"] +__all__ = ["datatype", "model", "binary_to_asm", "train", "tensors", "version"] From b73b9393cf75101090934f1b553156cd3056cf7e Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 14:37:40 +0200 Subject: [PATCH 28/59] AEGIS-6406 fix imports to account for moving utils.py to train.py --- asm2vec/tensors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 4319cd2..01b306f 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -1,7 +1,7 @@ import os import torch import logging -from asm2vec import utils +from asm2vec.train import train, load_model, load_data from pathlib import Path logging.basicConfig(level=logging.INFO, format='%(message)s') @@ -27,7 +27,7 @@ def calc_tensors(asm_path: str, device = 'cuda' if torch.cuda.is_available() else 'cpu' if os.path.isfile(model_path): - model, tokens = utils.load_model(model_path, device=device) + model, tokens = load_model(model_path, device=device) else: print("No valid model") return [] @@ -42,21 +42,21 @@ def calc_tensors(asm_path: str, if entry.is_dir() and os.listdir(entry) and entry.name: tensor_file = os.path.join(dir0, entry.name) if not (os.path.exists(tensor_file)): - functions, tokens_new = utils.load_data([entry]) + functions, tokens_new = load_data([entry]) file_count = sum(len(files) for _, _, files in os.walk(entry)) tokens.update(tokens_new) logging.info(f"Binary {entry.name}: {file_count} assembly functions") model.update(file_count, tokens.size()) model = model.to(device) - model = utils.train( + model = train( functions, tokens, model=model, epochs=epochs, device=device, mode='test', - learning_rate=lr + learning_rate=learning_rate ) tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))])) From bd8bcd780e9a3d8c6cdcb89ab65b7695efa04d39 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 15:23:03 +0200 Subject: [PATCH 29/59] AEGIS-6406 Delete asm2vec/utils.py The code has been 
migrated to asm2vec/train.py --- asm2vec/utils.py | 167 ----------------------------------------------- 1 file changed, 167 deletions(-) delete mode 100644 asm2vec/utils.py diff --git a/asm2vec/utils.py b/asm2vec/utils.py deleted file mode 100644 index b233d33..0000000 --- a/asm2vec/utils.py +++ /dev/null @@ -1,167 +0,0 @@ -import os -import time -import torch -from torch.utils.data import DataLoader, Dataset -from pathlib import Path -from asm2vec.datatype import Tokens, Function, Instruction -from asm2vec.model import ASM2VEC - - -class AsmDataset(Dataset): - def __init__(self, x, y): - self.x = x - self.y = y - - def __len__(self): - return len(self.x) - - def __getitem__(self, index): - return self.x[index], self.y[index] - - -def load_data(paths, limit=None): - if type(paths) is not list: - paths = [paths] - - filenames = [] - for path in paths: - if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) - if os.path.isfile(Path(path) / filename)] - else: - filenames += [Path(path)] - - functions, tokens = [], Tokens() - for i, filename in enumerate(filenames): - if limit and i >= limit: - break - with open(filename) as f: - fn = Function.load(f.read()) - functions.append(fn) - tokens.add(fn.tokens()) - - return functions, tokens - - -def preprocess(functions, tokens): - x, y = [], [] - for i, fn in enumerate(functions): - for seq in fn.random_walk(): - for j in range(1, len(seq) - 1): - x.append([i] + [tokens[token].index for token in seq[j-1].tokens() + seq[j+1].tokens()]) - y.append([tokens[token].index for token in seq[j].tokens()]) - return torch.tensor(x), torch.tensor(y) - - -def train( - functions, - tokens, - model=None, - embedding_size=100, - batch_size=1024, - epochs=10, - neg_sample_num=25, - calc_acc=False, - device='cpu', - mode='train', - callback=None, - learning_rate=0.02 -): - if mode == 'train': - if model is None: - model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) - optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - elif mode == 'test': - if model is None: - raise ValueError("test mode required pretrained model") - optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) - else: - raise ValueError("Unknown mode") - - loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True) - for epoch in range(epochs): - start = time.time() - loss_sum, loss_count, accs = 0.0, 0, [] - - model.train() - for i, (inp, pos) in enumerate(loader): - neg = tokens.sample(inp.shape[0], neg_sample_num) - loss = model(inp.to(device), pos.to(device), neg.to(device)) - loss_sum, loss_count = loss_sum + loss, loss_count + 1 - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if i == 0 and calc_acc: - probs = model.predict(inp.to(device), pos.to(device)) - accs.append(accuracy(pos, probs)) - - if callback: - callback({ - 'model': model, - 'tokens': tokens, - 'epoch': epoch, - 'time': time.time() - start, - 'loss': loss_sum / loss_count, - 'accuracy': torch.tensor(accs).mean() if calc_acc else None - }) - - return model - - -def save_model(path, model, tokens): - torch.save({ - 'model_params': ( - model.embeddings.num_embeddings, - model.embeddings_f.num_embeddings, - model.embeddings.embedding_dim - ), - 'model': model.state_dict(), - 'tokens': tokens.state_dict(), - }, path) - - -def load_model(path, device='cpu'): - checkpoint = torch.load(path, map_location=device) - tokens = Tokens() - 
tokens.load_state_dict(checkpoint['tokens']) - model = ASM2VEC(*checkpoint['model_params']) - model.load_state_dict(checkpoint['model']) - model = model.to(device) - return model, tokens - - -def show_probs(x, y, probs, tokens, limit=None, pretty=False): - if pretty: - tl, tr, bl, br = '┌', '┐', '└', '┘' - lm, rm, tm, bm = '├', '┤', '┬', '┴' - h, v = '─', '│' - arrow = ' ➔' - else: - tl, tr, bl, br = '+', '+', '+', '+' - lm, rm, tm, bm = '+', '+', '+', '+' - h, v = '-', '|' - arrow = '->' - top = probs.topk(5) - for i, (xi, yi) in enumerate(zip(x, y)): - if limit and i >= limit: - break - xi, yi = xi.tolist(), yi.tolist() - print(tl + h * 42 + tr) - print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') - print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') - print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') - print(lm + h * 8 + tm + h * 33 + rm) - for value, index in zip(top.values[i], top.indices[i]): - if index in yi: - colorbegin, colorclear = '\033[92m', '\033[0m' - else: - colorbegin, colorclear = '', '' - print(f'{v} {colorbegin}{value*100:05.2f}%{colorclear} {v} {colorbegin}' - f'{tokens[index.item()].name:31}{colorclear} {v}') - print(bl + h * 8 + bm + h * 33 + br) - - -def accuracy(y, probs): - return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) From 91bfc9061ee3d72eec824d0cf3f8d55dacb7d31b Mon Sep 17 00:00:00 2001 From: "CI2.0" Date: Tue, 3 Oct 2023 13:49:46 +0000 Subject: [PATCH 30/59] [Jenkins] Set version to 1.0.2 --- asm2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index f1ae280..d6f3f4b 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,2 +1,2 @@ -VERSION = '1.0.1' +VERSION = '1.0.2' DEV_VERSION = '0' From 8751b197e516153bff72fa6535f738f6457ae121 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:52:53 +0200 Subject: [PATCH 31/59] AEGIS-6405 Create test_binary_to_asm.py --- test/test_binary_to_asm.py | 141 +++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 test/test_binary_to_asm.py diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py new file mode 100644 index 0000000..f042166 --- /dev/null +++ b/test/test_binary_to_asm.py @@ -0,0 +1,141 @@ +from pathlib import Path +from unittest import TestCase +from asm2vec.binary_to_asm import (bin_to_asm, + convert_to_asm, + _fn_to_asm, + _normalize, + _sha3, + _valid_exe) + + +class TestBinaryToAsm(TestCase): + + @classmethod + def setUpClass(cls): + print("\n--- TestBinaryToAsm ---") + cls.output_path = 'malware_asm/' + cls.pdf_dict = {'name': 'main', 'size': 18, 'addr': 4294974144, + 'ops': [{'offset': 4294974144, 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, + 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rbp', + 'disasm': 'push rbp', 'bytes': '55', 'family': 'cpu', 'type': 'rpush', + 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0, + 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], + 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA=='}, + {'offset': 4294974145, 'esil': 'rsp,rbp,=', 'refptr': 0, 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbp, rsp', 'disasm': 'mov rbp, rsp', + 'bytes': '4889e5', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, + 'type2_num': 0}, 
{'offset': 4294974148, 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, 'fcn_addr': 4294974144, 'fcn_last': 4294974161, + 'size': 1, 'opcode': 'push rbx', 'disasm': 'push rbx', 'bytes': '53', + 'family': 'cpu', 'type': 'rpush', 'reloc': 'False', + 'type_num': 268435468, 'type2_num': 0}, + {'offset': 4294974149, 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, + 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rax', + 'disasm': 'push rax', 'bytes': '50', 'family': 'cpu', 'type': 'rpush', + 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0}, + {'offset': 4294974150, 'esil': 'rsi,rbx,=', 'refptr': 0, 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbx, rsi', 'disasm': 'mov rbx, rsi', + 'bytes': '4889f3', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, + 'type2_num': 0}, {'offset': 4294974153, 'ptr': 4294985864, + 'esil': '0x2db8,rip,+,[8],rax,=', 'refptr': 8, + 'fcn_addr': 4294974144, 'fcn_last': 4294974155, 'size': 7, + 'opcode': 'mov rax, qword [rip + 0x2db8]', + 'disasm': 'mov rax, qword [0x100004888]', 'bytes': '488b05b82d0000', + 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, + 'type2_num': 0, 'refs': [{'addr': 4294985864, 'type': 'DATA', + 'perm': 'r--'}]}, {'offset': 4294974160, + 'esil': 'rax,rip,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974160, + 'size': 2, + 'opcode': 'jmp rax', + 'disasm': 'jmp rax', + 'bytes': 'ffe0', + 'family': 'cpu', + 'type': 'rjmp', + 'reloc': 'False', + 'type_num': 268435458, + 'type2_num': 0}]} + + def test_sha3(self): + """Should return 64-character long string""" + asm = ("push rbp\n" + "mov rbp, rsp\n" + "push rbx\n" + "push rax\n" + "mov rbx, rsi\n" + "mov rax, qword [rip + CONST]\n" + "jmp rax") + self.assertRegex(_sha3(asm), '^[a-f0-9]{64}') + + def test_valid_exe_when_valid_magic_bytes(self): + """Should return boolean""" + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + filename = Path(binary_location) + magic_bytes = ['cffaedfe'] + self.assertEqual(_valid_exe(filename, magic_bytes), True) + + def test_valid_exe_when_not_valid_magic_bytes(self): + """Should return boolean""" + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + filename = Path(binary_location) + magic_bytes = ['cafebabe'] + self.assertEqual(_valid_exe(filename, magic_bytes), False) + + def test_normalize_when_offset(self): + """Should return normalized opcode""" + opcode = "mov rax, qword [rip + 0x2db8]" + expected_norm_opcode = "mov rax, qword [rip + CONST]" + self.assertEqual(_normalize(opcode), expected_norm_opcode) + + def test_normalize_when_no_offset(self): + """Should return normalized opcode""" + opcode = 'mov rbx, rsi' + expected_norm_opcode = "mov rbx, rsi" + self.assertEqual(_normalize(opcode), expected_norm_opcode) + + def test_fn_to_asm_returns_empty_string_when_pdf_none(self): + """Should return assembly functions with normalized opcode""" + pdf = None + asm_min = 5 + expected_asm = "" + self.assertEqual(_fn_to_asm(pdf, asm_min), expected_asm) + + def test_fn_to_asm_returns_empty_string_when_pdfops_shorter_than_minlen(self): + """Should return assembly functions with normalized opcode""" + asm_minlen = 10 + expected_asm = "" + self.assertEqual(_fn_to_asm(self.pdf_dict, asm_minlen), expected_asm) + + def test_fn_to_asm_returns_expected_asm(self): + """Should return assembly functions with normalized opcode""" + asm_min = 5 + expected_asm = (" push rbp\n" + " mov rbp, rsp\n" + " push rbx\n" + " push 
rax\n" + " mov rbx, rsi\n" + " mov rax, qword [rip + CONST]\n" + " jmp rax\n") + self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) + + def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): + binary_location = "malware/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + asm_minlen = 5 + magic_bytes = ['cffaedfe'] + self.assertEqual(bin_to_asm(Path(binary_location), Path(self.output_path), asm_minlen, magic_bytes), 1) + + def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self): + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + asm_minlen = 10 + magic_bytes = ['cffaedfe'] + self.assertEqual(bin_to_asm(Path(binary_location), self.output_path, asm_minlen, magic_bytes), 0) + + def test_convert_to_asm_returns_expected_sha1(self): + input_path = 'malware_bin/' + asm_minlen_upper = 10 + asm_minlen_lower = 5 + expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"] + self.assertEqual(convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), + expected_sha1) From 0a990f9e25a5b5f20e64484a614bdea503768783 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:54:35 +0200 Subject: [PATCH 32/59] AEGIS-6405 Create __init__.py --- test/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 test/__init__.py diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..472793c --- /dev/null +++ b/test/__init__.py @@ -0,0 +1 @@ +__all__ = ["test_binary_to_asm"] From 991f9cec04b7a03efae7b2256ba0857b82cd11b2 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:00:43 +0200 Subject: [PATCH 33/59] AEGIS-6405 Create sample_binary --- asm2vec/data/sample_binary | 1 + 1 file changed, 1 insertion(+) create mode 100644 asm2vec/data/sample_binary diff --git a/asm2vec/data/sample_binary b/asm2vec/data/sample_binary new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/asm2vec/data/sample_binary @@ -0,0 +1 @@ + From 85a8b95014c3248a2678ebb761cbabe21c7f5cf7 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:03:24 +0200 Subject: [PATCH 34/59] AEGIS-6405 upload test binary This malware file is added for the purposes of test/test_binary_to_asm.py. To use locally, it should be placed at the malware_bin/ directory, as indicated in the unit test. 
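A minimal local-setup sketch (illustrative only, not part of this change): the sha1 filename and the
asm2vec/data/ location below come from this commit, and malware_bin/ is the directory the unit test
expects the binary to live in.

    # illustrative helper: copy the committed sample to where test_binary_to_asm.py looks for it
    import shutil
    from pathlib import Path

    sha1 = "5cca32eb8f9c2a024a57ce12e3fb66070662de80"
    Path("malware_bin").mkdir(exist_ok=True)
    shutil.copy(Path("asm2vec/data") / sha1, Path("malware_bin") / sha1)

    # then run the tests, e.g.: python -m unittest test.test_binary_to_asm

The tests read the binary from malware_bin/ and write the disassembled functions under malware_asm/.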
--- .../5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 0 -> 33056 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000000000000000000000000000000000000..208607f77c28082e1b391a5c7b16333894760e2d GIT binary patch literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt literal 0 HcmV?d00001 From 38cf710460dbb3615b237b4028462623646da520 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:03:47 +0200 Subject: [PATCH 35/59] AEGIS-6405 Delete asm2vec/data/sample_binary --- asm2vec/data/sample_binary | 1 - 1 file changed, 1 deletion(-) delete mode 100644 asm2vec/data/sample_binary diff --git a/asm2vec/data/sample_binary b/asm2vec/data/sample_binary deleted file mode 100644 index 8b13789..0000000 --- a/asm2vec/data/sample_binary +++ /dev/null @@ -1 +0,0 @@ - From 6f73e51a6a1e080cdcf457658850c8dc241c8e8b Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:04:50 +0200 Subject: [PATCH 36/59] AEGIS-6405 Delete asm2vec/data directory --- .../5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 33056 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/asm2vec/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 deleted file mode 100644 index 208607f77c28082e1b391a5c7b16333894760e2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt From d9f3f998ecd92182059e25cc5f1ccab98a96a3e9 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:05:18 +0200 Subject: [PATCH 37/59] AEGIS-6405 Create sample_binary --- data/sample_binary | 1 + 1 file changed, 1 insertion(+) create mode 100644 data/sample_binary diff --git a/data/sample_binary b/data/sample_binary new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/data/sample_binary @@ -0,0 +1 @@ + From 383dfedbe5196fe5be8400eadb1341965a482aa1 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:07:14 +0200 Subject: [PATCH 38/59] Add files via upload Adding malware file for the purposes of test/test_binary_to_asm.py. To use locally, the binary shall be placed in the malware_bin/ folder, as indicated in the unit test. --- data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 0 -> 33056 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000000000000000000000000000000000000..208607f77c28082e1b391a5c7b16333894760e2d GIT binary patch literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt literal 0 HcmV?d00001 From b057394aa16e9af666f9aeff5e0aa3da5fbcb209 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:08:02 +0200 Subject: [PATCH 39/59] AEGIS-6405 Delete data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 --- data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 33056 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 deleted file mode 100644 index 208607f77c28082e1b391a5c7b16333894760e2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt From 047e41895a30fcc24c2fb0655e1c463d20e59e38 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:09:59 +0200 Subject: [PATCH 40/59] AEGIS-6405 add sample binary For the purposes of test/test_binary_to_asm.py. To be placed in the malware_bin/ folder, as indicated in the unit test. --- data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 | Bin 0 -> 33056 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000000000000000000000000000000000000..208607f77c28082e1b391a5c7b16333894760e2d GIT binary patch literal 33056 zcmeI*d00*B8#nN#UD2#UXrQ8mGHgUNk(`vukSV*Ok|vs^v`FGqq#{$1PzWKJHOd%e z2t_iKicnIS>b=+g<{X^g_5S}}?_Srn*LOXiHMI8qto^KYI;y*0n>r3#Qu7WcBw+`1j`jY_Z3AOdIlev!~9R{re{T z{Y`#1q3$-FiqOv&y9JLI;2ILZE@Arfc=`S4&Csf{0a(wr#k#lUhnNuaYm>+G4+wPe z|A+oOUi>&5Pr7@HpY5MJNOm7jyc=*l-ZEFO6|R0{e;#j7_wls4w`bdI-T#^XbAC&F zefn6V*Bs6ex){l>Q450V_$6FvJ;QzWGHTL+wA5YQ*=lAn?Z2QmS{o3a7yqy1U=O^ENJofX$PHg+v z{n_sMwO@F=rOpA)?3S4RJUhzoX2zV|Mx2X@B7_29mo6i7{Atw z-&g-0?w8N~jOjL3Q>RS-`I%*K9F3nlP@A28wpsqUiuFN4LW}{sz+h~Vr|iqNyC)rn z_N8UZ?qtu2UB80b?6vWjlD!tC=w(YUcKZ1>*krf=wa)DNU#EOVNa*Kh^H-Oph6cUZ z&$WJ_zn{LBhl~EwRbES}%})L4#h%Br{B4(JsZP+}oq1w;;*ewKw8;2<+_NX0h?LOK zpKVHZ(xG?C?#GThzfSDe8BZ^E9ed8gM$+{5g#Z28^z~=0@(*zJ9$_|u&cnlnHlm4! 
zF@F8n{Y_j$T$cm}IJVzpEYlw`Lf^vA+1oYP*Uz)N^9R#*?D;KYgoM~@TerK- z)>AMEDj=wUpaOyl2r3|`fS>|`3J5A7sDPjXf(i&KAgF+#0)h$%Dj=wUpaTCdRbYC(PV8N)d;f?C?e{N+?yj^XU!%PDp%?MpNtc_S)cE~&sW zj&p@It?OOdrTmp|X#O`Rp1zmcrTvYT<#J^bsNp8(Q$ULwuz15w&gnWe1JNv}wVdVr zWv4PXIp=a(HUHn8tbgChsCy0Vq&gBi712%wZgMV#wBvy8t!Ysni*$G=_K;me_~$`i z+()&QzwC62cDzZeUSSox)gvrEp`9w_a<9|I?CELeVTT8_!+rWnWL|WtAC%kGgoHlH zMJT7zdO3Mc1{)5XW>Pr?&Q9Qd;5%RvFa`JqxCOWa_!9UxFcJ6+7zNx6d<H)_9*8m3t^?|E^ z1A#+<0l)!3ZJ;l(4^RWR9M}ukALt5{0rmzu0VRP7z=c3jpfqqUu#^7p23?;LKs(?U zpa^gVuod`&4F}FN;0Is_&fQ7(YKxN=%UoLtZUEi|h5*+BZvy>+VZf_EA7C&rALv0XhU-W3M-11CZ7^IniiHfVsM>G|Dj*d1-KgN11`_WB^D>*EvfI~xuh3*dWTJJ1x^0Bivo0bc`~fa8GAf%U*qz)IjN z;BeqWU^P$&SO$Co)CArFJ^-o#OMrKQD!>BZO`sC+67VWe4tN%r59|p%1-uC40FME4 zfkMDc;7Q=O-u!x|1G9i_z*Jxcuo<`$xF7fqm;_7#z5#9l?f||7{tZk7J_AMpHv=C7 z8Q>=1ec)=~df**k5HJFG1Go|x3M>YC0Rw?~KzHB@;02%y&=YtD=m>NJW&`H|oquMOoDAFuGyoa`BY}FrF~Bvz z!9acBD&RoiP+$OX08ks~3+w~b04@ji0`>>G0%d@`flfe4paO6qP!uQ)oD1w^@8I-& z1hfNw0g3=;09%1S*r4Yl;0Is_&8%ts3^#^??$S{dbeE2xpu05tbezv{ z{VCYHv^OQav4hGK3LGBWlIJV~?gK6WE&=Wa&H*k4ZU@=|=L5F_t$?$Eu|NypOkgz7 z6!;f#9nc6k6&Ma22Q&eO07n5Q0{wx*fn$L_Kpkq)Lk!L7F0DsFcWG@3x=XW%tTxfV ze(KWR$={FdFw2ln1H+PXVQXO2A`4aiAP9 z6DSPq2}}omSK_}T4loti4ip0J1hxRbvEjf;0yY8LfLnm|)MB{JG=Ii$-%?O7*HXAM z+!~7T>=g!_C$w;Kf`QCEO8VH*yo^3}6dd~4QEaA<9Yrp~&7s)EaF0`%GTcmx#|(Er zMKHtNL!r-bw^Pu^E}lY+;r>lAk>ReVc+GIvP;6khArzwY6i#u5p28_K=_#CIJ3WO{ zykfWuDCo04hvGXug;Pk=Q#i#OdJ3mlMNi=r-t-hsVMw3Tk(5-6O!~)#^x@THTk@P? zz;vJ*a0oCJr~(`W+zC_yssfXMa=^a8Ex?{YMd05+4p0^t1r!4I05ZUDiv05w1Fi

h$8;7j0JpbPLBuny=5d<=X6oCmxQdsZ3p`0k7beX=(uGM8LKh~* zXSy&cIt^sfDd=`U^F#yA4t^tn=2{FlmZF^DZlJJXxD3Teh8s%pkl_YUB-6lhhyLj$ zmET>GLYI@zwsbf*fG)s`z+#{yFc+8yoCiD!ya1d9%mSVP+5j_v*}&<*{lKHZDZmuq zA)qmE2QUpd0hkEf1LOiX1CxOUz)ipepdN5NFb+5v7y;Y}90&}h7Q>CCIgK{q6g1id zQ_yJRN5LO$JZXU)ZI)6p+{HAf(Pkb6jW+faG}_FdpwY&Hf<_w?3L0%DP|#>IhJr?$ z5fn7q=upsTGl+sln*kIw+VrNN(MFzvMw^}#G}?$!&}h>s#~*DvC}^~4p`g*`9R-ax zZzyQAc|k#=O%(-=HV-Liw7E+`qsE#M1cd_U4BL|DTmf={*}(Ha1K?3$ z4p0wx2zUZG7?=h;0vrh313U;E089q%1NH$X0Cxj>0poz%fil32z^y<@U?eaWC<2*b4Loh5$bRmjnHQjX+nR53m;K1oQyb02cz60;_;?fjnRZ z&<@Do!GTi_v23%?atkq&bZ?78Eqvm{8DYGl7Ce8!iRI)u%a) zHbeOhEt=D4qeelaOde0MP)y}nD@LKq>B+WqIEuh9AO|Q53%SPP5* zZUoiP1`TyxU0uuQ$`ERJN=prQ8l@T>)AknXh|SAAf3<$}jtiswr6+e~NUR+r z*7U5;68%}%KYKh-l7H=PJV!Nf*4^a>3bqx_fyagf1Ui|$l8Z4eX@9WIyWsjw(O&77 zhj_et@L<3ruGrTTOO8A(vS09d%}D0jm-OtNBd5fi^hpxlGolLJe=^7DI-{aHwRRzz-IqZJ3$84F?^Dk2ys$R4+^2xU&R^H&rhDP*m zv97yQYgeYWGRZiu_w~8aOEOeLqjonRyl|k?eD%jEm+s$hxp(l=*2>F%n+Lc1guZaH zOWH7M{J?=n4oA<4G2Z3AqVrYcagEVBXNt65Xw45d+#hJ+>6CBi@pn#GWlQY^wL?lg zY3XBmr{C*ko0Oa~_WE#NHd6VGl)aSaUv1mR$H;E7uGH65j6K#dXi&xV0Z-pXzdoh- z@ML_5{je(C?1B`z-shaFZrvzXQVyB>`ccKXQGRcH7JbuuyV`xjyaRJRgokb`9yVuE z!wn$&putk)7(^78$rj4R0bD^*h^ z&C*u(e#rN5iAdpB@5P4H6h9Cz?ltskyUeAFIgV3Tr1h2Eb>2eOvt;{>)SEfOCjYR| zTdM1oHneoAjB`P3jlQDX+_$~~)?PjQCih5+?=^97jEKd>?6;rV756YnMy{(HR~mhl zHSWn%_3gi8f7W~Fx+6EU&v#wl=#(?)=2QPS4h13dheDDxRg8v)TJ)RN<{aAq?1K{O^;a)yiq(mnzdp2O%|y*=@!%_m?s)h) zy6T<{sjKQ8krrLmcB|Mo=J~8hHE)AUDXQ|@hgDC_G0-3TZRCBc-XrcOWT>Rp=!H7p zTU+#Wi>_BzeE-%e(}S8fUi=)^VWjH6%!|{PxI5&-(~H*YrIUyny1x1 zE`GTp#wu@|{C1Rvd^|n3v+t+l)4cTu4XWQiDtw6i3U}9QYa?YRZD|oMP@i1sEk1M7 zk-Wsgf6xDGp-Py;i(9#?ik-I1Z;e#a*giFOz(mVC4|yM?#U}PQsd(aU|1faNwGs#O z$4=rIDm<%jO}$Spnni8XAMc)LWWUsCSgTLu*TP%3I*u7RsYE57x)dE|v(HM^XF=4! zv==pxTFlOVZW%aD(#5f)-`@5|Q+3un4Bc1Q%SLJg??Tjqc;31VsfPm7h0UQ{3xb0lb4zMfT zFu(nR&v9GP8&-xEnPdO^p)y(h-)m>6)S2v>W*bOz? zptGEy>|Im7u5jO&Q`hQdns)I^=B57ECziL|yWRZi-ne;#YqCz=D(=#I6EAJ}y|0Pi zwvovj{OyPQ2wj$ar6YxVp)*u&&Db}e=WEJHW?1!o)6l;;Br>FBF?VF-XS8`CBJcV zhw5bb@9{ZU73{0{_ba363zw7`$JX>UJ-_Gs-S72Vaz02a3EyGHDQ*kS^WN5{Q|!&d z1kXk_-B86d&;GjH;1r~B_h`U*Q6)X|Ebqdm(k+8bv{zUWiIL`JtCs zxxvQ-y@?WLA0`$BIBXteXSV*umts%JsyY2uJ`-u0?KQAkX{*0*emV(FNm=FdX;oaiaxVd<(x z7Yc`e85-kwYgqDu+t$Zsl%{^S+|{pWzo*?2udMG6#m0Vht?R=XH7?~rwbq`SY7vcU z_RpjGAIy9AyzG>1SpV^tiLGcHA&< ztm)9#t3?-2x^S5{ezE#=pMF6Tho-3R71i4QS*>%BtN5k2pGT`?OWG|yasBDEs{^<8 zPHE!~9y+73;oXV_^>;4cI~aRQsx0p9s>s$3uOYFD{mw0Zo*iQ76}&C|=AwX?hW9t! z)^-kjUH`=^BKX)~SC!=vDr0UQ49hw;+UA0MO$GaH(e58@|1H^{DP@1=bPOf?+B%MA zG_lL{Fhf7*Dl8<=|E4PYtyN+6F#mJPqW##Fx!sv^G)w%SE6H#z-d-t99T`p#SX%z2)k8ng4I!l2hhHZOf@b+pbZ8``Ru zeiqxRa z6aL6tsK6}n!^?$>7K#4dn&w|`+p3sj-pkcOdH%WO$CIv1xTI!9|Kw!*tKfXQj>o5c z6FhvRB&|8Iap`;e+b9+uHBB@X?{8JwS4FK?mV!xh(yS9}ledN44P6ltUQlr@rfyv0P>JlajyQFMDU4XKE7EwQjfd%#OoeSpeOx?R15O@X1AMs(PH8tXI_kW01D}>Y9U#$IMFEXD=i>Y#FC5 zOCvYM#3yz@oMHL7GZ*XoY;2B`me_UsTE-@%9lwWlpVbR;QAPpY3W;^=NSqd5&Ny7je{^X<2W&*fQjt_EJx(6+pg zwaqbNRKVS7qS`%H_i^^E>xx)9>x+-?&X`>r5}a!<=k`^T-Jz;pW-k;RK3K|6;>XQs zxpm23W1h!f$drF*d-UY2&4w=)c|MY9wsTVC3|wv0?}=%1al`q6E*bi}7uRTAG380E ze$CNcubyy1S+}9?9Y>?oW##pKkF7jQG*V74NR(b+ldd)6cF~0aLZcVYtiGlB@Tpq8 z(A0GOMVV*jRec-N^I_xqr~@Ze>o2TKH&>YP=CbjarbT(<9y;19RY`yHaGiFxHp;MQ zkHbLIj|b0XydQhjk@Qp zV&h^x)%4Qo&SB!OQe5`=tF@~SHeOWy=(yPNL6gUNXpY`^{?^MWCqY4wHAJ}He}w<1JDw?_E>`lS8Lu;k(!(?*O| zcRL->8I?2KR5IIs)A><@mex+LI2img;#KR9wo^B}vX{QlX|dFJ9lfIe$M@b=vMzV? 
z=6Td;9hZ0cl%imF()qgjiN0sYmOMYuJ0$0#ndEevzoI%5#SG4Wv=Oda?47+X|L_fo zSZmME&n+1f_dcs!(=;6#3akHK(5nCL=$G^s$KKW>7faP@1Rd6na`Nf1#56MAZ+A*r z;>xh&vx3x*8Xk2^9rp0*{Tq7R&q;IiTys8slP+$&Z@E~c^0?d81ts^B4x5YDc<(;Z zwzt_{IxW;TxoY%xC1;Hpd67rTjY3LFC)@O&uNXBjdco%E*0Fx}x0j0NU)4wy>33xJ zlWI$i@>^wzWdTPQT)x+JrF`A~X%chSKCW!5tFqlt6Xq7XYD~Ynr>m9y+Z9w^g4AI09Q++<&)2qJiFQxa=;YYs#R&Lqqk6pwD zi`kqok5*QaxBs|4MD(R;@z9OQUn?Cx+TU6}T61gh%N>d)PcNG)cg`{0mnvf3)cDA6 zW>Bm;^S*gd?-z<0uLf>DCek>scYon~kEC8V-Dav{Oolg9mVfTQF8^I{W9O`k8H0KR ztUWz@spNB?z=5+mMZB+MDa>@^GC$0+?$2qt^X*H*_3=&L_vb8D^31fWj*8g(!`8~_ zl+)WW%ckpcdiksE*P1`RWkTcFJ{7Mv1uS_xyj86+vUkU%9(~4`TXcy2=vP`d$7lt0{iNUoF z=kx0dFJ3iEZI4jZ8|iRO*P=#*o7U8D#OFkt(JQqFADf@u)tJyOe5=ijH^J$aVutGI za`Q2shf<>UiFnloPo96?bCZ;$Mf=zz{oK! z6UUWSHmr<)5NO$Y{o*3~Rmn3xmD*ppn3bzud2B;M+7JbmLjT;GU##L48uc^%-&{SX z9q-eSyJJXGz=_hHTyvX^(ffiHL{BT>4xH|EZ^O3Lk0WP@Tpzb0{Y%J=%D9sU3q4gm zEekr_w>g_?sD8HJ)bDg%YPIQhyHDRH1T3k(^TcycXJJZMuS-cOh9)O_U*6A{^);FJ z**ZU6jNAaLx zQ5zNxn5VWNO3rhQdTPmJ%Mm7qaz1Jnk~zk&BBp(fkBE0rp7bO!-qzxtv+Oy;$WX!Wat-nu=%XJZKeNvTd{k6-e`Cl7x}ZT(!E~0J8uob`k(pp z>x6e7Ke0Pc{Lw!1hnIA>uXN{w?)>S0lta)IR6tMxK?MXA5L7@=0YL=>6%bTFPys;& z1QifeKu`fe1q2lkR6tMxK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMx zK?MXA5L7@=0YL=>6%bTFPys;&1QifeKu`fe1q2lkR6tMxK?MXA`2V~D3WMbB4eI5c zG`j1cV8Z?pY@UBWpbO81{!Tpo5qmp7$#@|@c?A=mpR2cTkgE{2`cliE|L3Zy#h-+E zJnv;Z{Do>)-ZEEbcGo`auO;A5`Sf#k>|{tUBYK%lb*A5ka+Sd7_&3*y)_Z!e;Lcicj=$AJA&G-MF(a-nr^X&igZvC8>`c2T+=Z+mY KhC61I(EkBO5z4Rt literal 0 HcmV?d00001 From d4af35d6b44547ab7de221b01f3808d02417b925 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:10:50 +0200 Subject: [PATCH 41/59] AEGIS-6405 fix path --- test/test_binary_to_asm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py index f042166..c9cbc10 100644 --- a/test/test_binary_to_asm.py +++ b/test/test_binary_to_asm.py @@ -121,7 +121,7 @@ def test_fn_to_asm_returns_expected_asm(self): self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): - binary_location = "malware/5cca32eb8f9c2a024a57ce12e3fb66070662de80" + binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" asm_minlen = 5 magic_bytes = ['cffaedfe'] self.assertEqual(bin_to_asm(Path(binary_location), Path(self.output_path), asm_minlen, magic_bytes), 1) From fb3b5070ec62b64cae7ca74c3812d5fbb49669c9 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Tue, 3 Oct 2023 20:13:29 +0200 Subject: [PATCH 42/59] AEGIS-6405 Delete data/sample_binary --- data/sample_binary | 1 - 1 file changed, 1 deletion(-) delete mode 100644 data/sample_binary diff --git a/data/sample_binary b/data/sample_binary deleted file mode 100644 index 8b13789..0000000 --- a/data/sample_binary +++ /dev/null @@ -1 +0,0 @@ - From df44a2f72d8311c3bb0a1f28aff91ff3768ae97c Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:44:33 +0100 Subject: [PATCH 43/59] AEGIS-6405 - test fix --- asm2vec/__init__.py | 7 +- test/test_binary_to_asm.py | 224 ++++++++++++++++++++++++++----------- 2 files changed, 162 insertions(+), 69 deletions(-) diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 2f3c046..2d9cfd9 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1 +1,6 @@ -__all__ = ["datatype", "model", "binary_to_asm", "train", "tensors", "version"] +import 
os + +__home__ = os.path.dirname(os.path.abspath(__path__[0])) +__data__ = os.path.join(__home__, "data") + +__all__ = ["__data__", "__home__", "binary_to_asm", "datatype", "model", "tensors", "train", "utils", "version"] diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py index c9cbc10..ce53411 100644 --- a/test/test_binary_to_asm.py +++ b/test/test_binary_to_asm.py @@ -1,62 +1,154 @@ +from os import path, mkdir from pathlib import Path +from shutil import rmtree from unittest import TestCase -from asm2vec.binary_to_asm import (bin_to_asm, - convert_to_asm, - _fn_to_asm, - _normalize, - _sha3, - _valid_exe) + +from asm2vec import __data__ +from asm2vec.binary_to_asm import (bin_to_asm, convert_to_asm, _fn_to_asm, _normalize, _sha3, _valid_exe) class TestBinaryToAsm(TestCase): @classmethod - def setUpClass(cls): + def setUpClass(cls) -> None: print("\n--- TestBinaryToAsm ---") - cls.output_path = 'malware_asm/' - cls.pdf_dict = {'name': 'main', 'size': 18, 'addr': 4294974144, - 'ops': [{'offset': 4294974144, 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, - 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rbp', - 'disasm': 'push rbp', 'bytes': '55', 'family': 'cpu', 'type': 'rpush', - 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0, - 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], - 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA=='}, - {'offset': 4294974145, 'esil': 'rsp,rbp,=', 'refptr': 0, 'fcn_addr': 4294974144, - 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbp, rsp', 'disasm': 'mov rbp, rsp', - 'bytes': '4889e5', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, - 'type2_num': 0}, {'offset': 4294974148, 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', - 'refptr': 0, 'fcn_addr': 4294974144, 'fcn_last': 4294974161, - 'size': 1, 'opcode': 'push rbx', 'disasm': 'push rbx', 'bytes': '53', - 'family': 'cpu', 'type': 'rpush', 'reloc': 'False', - 'type_num': 268435468, 'type2_num': 0}, - {'offset': 4294974149, 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', 'refptr': 0, - 'fcn_addr': 4294974144, 'fcn_last': 4294974161, 'size': 1, 'opcode': 'push rax', - 'disasm': 'push rax', 'bytes': '50', 'family': 'cpu', 'type': 'rpush', - 'reloc': 'False', 'type_num': 268435468, 'type2_num': 0}, - {'offset': 4294974150, 'esil': 'rsi,rbx,=', 'refptr': 0, 'fcn_addr': 4294974144, - 'fcn_last': 4294974159, 'size': 3, 'opcode': 'mov rbx, rsi', 'disasm': 'mov rbx, rsi', - 'bytes': '4889f3', 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, - 'type2_num': 0}, {'offset': 4294974153, 'ptr': 4294985864, - 'esil': '0x2db8,rip,+,[8],rax,=', 'refptr': 8, - 'fcn_addr': 4294974144, 'fcn_last': 4294974155, 'size': 7, - 'opcode': 'mov rax, qword [rip + 0x2db8]', - 'disasm': 'mov rax, qword [0x100004888]', 'bytes': '488b05b82d0000', - 'family': 'cpu', 'type': 'mov', 'reloc': 'False', 'type_num': 9, - 'type2_num': 0, 'refs': [{'addr': 4294985864, 'type': 'DATA', - 'perm': 'r--'}]}, {'offset': 4294974160, - 'esil': 'rax,rip,=', - 'refptr': 0, - 'fcn_addr': 4294974144, - 'fcn_last': 4294974160, - 'size': 2, - 'opcode': 'jmp rax', - 'disasm': 'jmp rax', - 'bytes': 'ffe0', - 'family': 'cpu', - 'type': 'rjmp', - 'reloc': 'False', - 'type_num': 268435458, - 'type2_num': 0}]} + cls.output_path = "malware_asm/" + cls.data_path = path.join(__data__, "5cca32eb8f9c2a024a57ce12e3fb66070662de80") + cls.pdf_dict = { + 'name': 'main', + 'size': 18, + 'addr': 4294974144, + 'ops': [ + { + 'offset': 
4294974144, + 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rbp', + 'disasm': 'push rbp', + 'bytes': '55', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0, + 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], + 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA==' + }, + { + 'offset': 4294974145, + 'esil': 'rsp,rbp,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, + 'size': 3, + 'opcode': 'mov rbp, rsp', + 'disasm': 'mov rbp, rsp', + 'bytes': '4889e5', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0 + }, + { + 'offset': 4294974148, + 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rbx', + 'disasm': 'push rbx', + 'bytes': '53', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0 + }, + { + 'offset': 4294974149, + 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rax', + 'disasm': 'push rax', + 'bytes': '50', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0 + }, + { + 'offset': 4294974150, + 'esil': 'rsi,rbx,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, + 'size': 3, + 'opcode': 'mov rbx, rsi', + 'disasm': 'mov rbx, rsi', + 'bytes': '4889f3', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0 + }, + { + 'offset': 4294974153, + 'ptr': 4294985864, + 'esil': '0x2db8,rip,+,[8],rax,=', + 'refptr': 8, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974155, + 'size': 7, + 'opcode': 'mov rax, qword [rip + 0x2db8]', + 'disasm': 'mov rax, qword [0x100004888]', + 'bytes': '488b05b82d0000', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0, + 'refs': [ + { + 'addr': 4294985864, + 'type': 'DATA', + 'perm': 'r--' + } + ] + }, + { + 'offset': 4294974160, + 'esil': 'rax,rip,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974160, + 'size': 2, + 'opcode': 'jmp rax', + 'disasm': 'jmp rax', + 'bytes': 'ffe0', + 'family': 'cpu', + 'type': 'rjmp', + 'reloc': 'False', + 'type_num': 268435458, + 'type2_num': 0 + } + ] + } + mkdir(cls.output_path) + + + @classmethod + def tearDownClass(cls) -> None: + rmtree(cls.output_path) def test_sha3(self): """Should return 64-character long string""" @@ -71,17 +163,13 @@ def test_sha3(self): def test_valid_exe_when_valid_magic_bytes(self): """Should return boolean""" - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" - filename = Path(binary_location) - magic_bytes = ['cffaedfe'] - self.assertEqual(_valid_exe(filename, magic_bytes), True) + magic_bytes = ["cffaedfe"] + self.assertEqual(_valid_exe(self.data_path, magic_bytes), True) def test_valid_exe_when_not_valid_magic_bytes(self): """Should return boolean""" - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" - filename = Path(binary_location) - magic_bytes = ['cafebabe'] - self.assertEqual(_valid_exe(filename, magic_bytes), False) + magic_bytes = ["cafebabe"] + self.assertEqual(_valid_exe(self.data_path, magic_bytes), False) def test_normalize_when_offset(self): """Should return normalized opcode""" @@ -91,7 +179,7 @@ def 
test_normalize_when_offset(self): def test_normalize_when_no_offset(self): """Should return normalized opcode""" - opcode = 'mov rbx, rsi' + opcode = "mov rbx, rsi" expected_norm_opcode = "mov rbx, rsi" self.assertEqual(_normalize(opcode), expected_norm_opcode) @@ -121,21 +209,21 @@ def test_fn_to_asm_returns_expected_asm(self): self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" asm_minlen = 5 - magic_bytes = ['cffaedfe'] - self.assertEqual(bin_to_asm(Path(binary_location), Path(self.output_path), asm_minlen, magic_bytes), 1) + magic_bytes = ["cffaedfe"] + self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 1) def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self): - binary_location = "malware_bin/5cca32eb8f9c2a024a57ce12e3fb66070662de80" asm_minlen = 10 magic_bytes = ['cffaedfe'] - self.assertEqual(bin_to_asm(Path(binary_location), self.output_path, asm_minlen, magic_bytes), 0) + self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 0) def test_convert_to_asm_returns_expected_sha1(self): - input_path = 'malware_bin/' + input_path = __data__ asm_minlen_upper = 10 asm_minlen_lower = 5 expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"] - self.assertEqual(convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), - expected_sha1) + self.assertEqual( + convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), + expected_sha1 + ) From 7beb70579e85e778ba18606f82e85a64812594b6 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:56:27 +0100 Subject: [PATCH 44/59] AEGIS-6405 - r2env --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d92495b..c846480 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ torch>=1.7,<2 click>=7.1,<8 r2pipe>=1.5,<2 +r2env>=0.5.7,<1 From ba5f4086bc26424784209a2ec26a77e903e16294 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:06:39 +0100 Subject: [PATCH 45/59] AEGIS-6405 - radar2 install --- setup.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index be492bc..917afbb 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,25 @@ +import os from setuptools import setup, find_packages +from setuptools.command.install import install as _install from asm2vec.version import VERSION +class install(_install): + @staticmethod + def _setup_radare2() -> None: + if os.system('r2env shell "r2 -v"') == 0: + print("radar2 already set up!") + return + os.system("r2env init") + os.system("r2env add radare2") + os.system("r2env use radare2@git") + + def run(self): + _install.run(self) + self._setup_radare2() + + def readme(): with open('README.md') as f: return f.read() @@ -22,9 +39,10 @@ def read_requirements(): author_email='jamie.nutter@jamf.com', license='MIT License', install_requires=read_requirements(), - packages = find_packages(), + packages=find_packages(), zip_safe=False, include_package_data=True, test_suite='nose.collector', - tests_require=['nose'] + tests_require=['nose'], + cmdclass={'install': install} ) From 9775f0cd12110c96c2766f3cbb41597f84eb8fde 
Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:27:14 +0100 Subject: [PATCH 46/59] AEGIS-6405 - radar2 test --- setup.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 917afbb..a6e9aea 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ import os +import sys from setuptools import setup, find_packages from setuptools.command.install import install as _install @@ -8,12 +9,12 @@ class install(_install): @staticmethod def _setup_radare2() -> None: - if os.system('r2env shell "r2 -v"') == 0: - print("radar2 already set up!") - return - os.system("r2env init") - os.system("r2env add radare2") - os.system("r2env use radare2@git") + if sys.platform.startswith("linux"): + os.system("apt-get install radare2") + elif sys.platform.startswith("darwin"): + os.system("brew install radare2") + else: + print("Ensure 'radar2' is installed...") def run(self): _install.run(self) From e8735feaa257b45996e3bb6d9871db6bbdae8fb5 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 17:16:42 +0100 Subject: [PATCH 47/59] AEGIS-6405 - radare2 test 2 --- asm2vec/version.py | 2 ++ setup.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index d6f3f4b..500db07 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,2 +1,4 @@ VERSION = '1.0.2' DEV_VERSION = '0' + +radare2_version = "5.8.8" diff --git a/setup.py b/setup.py index a6e9aea..e594ee2 100644 --- a/setup.py +++ b/setup.py @@ -3,22 +3,29 @@ from setuptools import setup, find_packages from setuptools.command.install import install as _install -from asm2vec.version import VERSION +from asm2vec.version import VERSION, radare2_version class install(_install): @staticmethod def _setup_radare2() -> None: if sys.platform.startswith("linux"): - os.system("apt-get install radare2") + os.system("apt-get update") + os.system("apt-get install -y --no-install-recommends wget") + os.system(f"wget -O /tmp/radare2_${radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/${radare2_version}/radare2_${radare2_version}_arm64.deb") + os.system(f"dpkg -i /tmp/radare2_${radare2_version}_arm64.deb") + os.system("r2pm init") + os.system("r2pm update") + os.system(f"rm /tmp/radare2_${radare2_version}_arm64.deb") elif sys.platform.startswith("darwin"): os.system("brew install radare2") else: print("Ensure 'radar2' is installed...") def run(self): - _install.run(self) self._setup_radare2() + _install.run(self) + def readme(): From 3a2db7750f8038b0be66a4b3f56abc9b4a62be08 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 17:41:23 +0100 Subject: [PATCH 48/59] AEGIS-6405 - radare2 test 3 --- requirements.txt | 1 - setup.py | 20 ++++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index c846480..d92495b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ torch>=1.7,<2 click>=7.1,<8 r2pipe>=1.5,<2 -r2env>=0.5.7,<1 diff --git a/setup.py b/setup.py index e594ee2..b5a1ba8 100644 --- a/setup.py +++ b/setup.py @@ -10,13 +10,18 @@ class install(_install): @staticmethod def _setup_radare2() -> None: if sys.platform.startswith("linux"): - os.system("apt-get update") - os.system("apt-get install -y --no-install-recommends wget") - os.system(f"wget -O 
/tmp/radare2_${radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/${radare2_version}/radare2_${radare2_version}_arm64.deb") - os.system(f"dpkg -i /tmp/radare2_${radare2_version}_arm64.deb") - os.system("r2pm init") - os.system("r2pm update") - os.system(f"rm /tmp/radare2_${radare2_version}_arm64.deb") + commands = [ + "apt-get update", + "apt-get install -y --no-install-recommends wget", + f"wget -O /tmp/radare2_{radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_arm64.deb", + f"dpkg -i /tmp/radare2_{radare2_version}_arm64.deb", + "r2pm init", + "r2pm update", + f"rm /tmp/radare2_{radare2_version}_arm64.deb" + ] + for command in commands: + if os.system(command) != 0: + raise Exception(f"Install radare2 failed: '{command}'") elif sys.platform.startswith("darwin"): os.system("brew install radare2") else: @@ -27,7 +32,6 @@ def run(self): _install.run(self) - def readme(): with open('README.md') as f: return f.read() From d36eced48ea68c97de849a59fa4679898512ab30 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:19:58 +0100 Subject: [PATCH 49/59] AEGIS-6405 - setup arch --- setup.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index b5a1ba8..19a3051 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import sys +import platform from setuptools import setup, find_packages from setuptools.command.install import install as _install @@ -9,21 +10,29 @@ class install(_install): @staticmethod def _setup_radare2() -> None: - if sys.platform.startswith("linux"): + if sys.platform.startswith("linux"): # Install required in Docker images + machine = platform.machine() + if machine in ["aarch64", "arm"]: + architecture = "arm64" + elif machine in ["x86_64"]: + architecture = "amd64" + elif machine in ["i386", "i686"]: + architecture = "i386" + else: + raise Exception(f"No architecture for Linux Machine: '{machine}'") + commands = [ "apt-get update", "apt-get install -y --no-install-recommends wget", - f"wget -O /tmp/radare2_{radare2_version}_arm64.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_arm64.deb", - f"dpkg -i /tmp/radare2_{radare2_version}_arm64.deb", + f"wget -O /tmp/radare2_{radare2_version}_{architecture}.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_{architecture}.deb", + f"dpkg -i /tmp/radare2_{radare2_version}_{architecture}.deb", "r2pm init", "r2pm update", - f"rm /tmp/radare2_{radare2_version}_arm64.deb" + f"rm /tmp/radare2_{radare2_version}_{architecture}.deb" ] for command in commands: if os.system(command) != 0: raise Exception(f"Install radare2 failed: '{command}'") - elif sys.platform.startswith("darwin"): - os.system("brew install radare2") else: print("Ensure 'radar2' is installed...") From 7ab939b6e5a77eb17107313af3516176d5a4df72 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:16:42 +0100 Subject: [PATCH 50/59] AEGIS-6406 - moved scripts --- README.md | 166 ++++-------------------------------------- asm2vec/__init__.py | 5 +- asm2vec/data.py | 43 +++++++++++ asm2vec/model.py | 32 +++++++- asm2vec/similarity.py | 42 +++++++++++ asm2vec/test.py | 39 ++++++++++ asm2vec/train.py | 162 ++++++----------------------------------- asm2vec/utilities.py | 55 
++++++++++++++ requirements.txt | 1 - scripts/compare.py | 44 ----------- scripts/test.py | 44 ----------- 11 files changed, 251 insertions(+), 382 deletions(-) create mode 100644 asm2vec/data.py create mode 100644 asm2vec/similarity.py create mode 100644 asm2vec/test.py create mode 100644 asm2vec/utilities.py delete mode 100644 scripts/compare.py delete mode 100644 scripts/test.py diff --git a/README.md b/README.md index c5fc4ae..637d5db 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # asm2vec-pytorch -release 1.0.0 +release 1.0.3 mit python @@ -9,30 +9,17 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B ## Requirements -python >= 3.10 - -| packages | for | -| --- | --- | -| r2pipe | `scripts/bin2asm.py` | -| click | `scripts/*` | -| torch | almost all code need it | - -You also need to install `radare2` to run `scripts/bin2asm.py`. `r2pipe` is just the python interface to `radare2` - -If you only want to use the library code, you just need to install `torch` +* python >= 3.10 +* radare2 +* Packages listed in `requirements.txt` ## Install ``` +pip install -r requirements.txt && python setup.py install ``` -or - -``` -pip install git+https://github.com/oalieno/asm2vec-pytorch.git -``` - ## Benchmark An implementation already exists here: [Lancern/asm2vec](https://github.com/Lancern/asm2vec) @@ -46,141 +33,20 @@ Following is the benchmark of training 1000 functions in 1 epoch. ## Get Started -```bash -python scripts/bin2asm.py -i /bin/ -o asm/ -``` - -First generate asm files from binarys under `/bin/`. -You can hit `Ctrl+C` anytime when there is enough data. - -```bash -python scripts/train.py -i asm/ -l 100 -o model.pt --epochs 100 -``` - -Try to train the model using only 100 functions and 100 epochs for a taste. -Then you can use more data if you want. - -```bash -python scripts/test.py -i asm/123456 -m model.pt -``` - -After you train your model, try to grab an assembly function and see the result. -This script will show you how the model perform. -Once you satisfied, you can take out the embedding vector of the function and do whatever you want with it. +### TODO - update this with description about to how use etc -## Usage +## Tests -### bin2asm.py +### Run test suite -``` -Usage: bin2asm.py [OPTIONS] +* Run all tests: ``python -m unittest discover -v`` +* Run a certain module's tests: ``python -m unittest -v test.test_binary_to_asm`` +* Run a certain test class: ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm`` +* Run a certain test method: - Extract assembly functions from binary executable + ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm.test_sha3`` -Options: - -i, --input TEXT input directory / file [required] - -o, --output TEXT output directory - -l, --len INTEGER ignore assembly code with instructions amount smaller - than minlen +### Coverage - --help Show this message and exit. 
-``` - -```bash -# Example -python bin2asm.py -i /bin/ -o asm/ -``` - -### train.py - -``` -Usage: train.py [OPTIONS] - -Options: - -i, --input TEXT training data folder [required] - -o, --output TEXT output model path [default: model.pt] - -m, --model TEXT load previous trained model path - -l, --limit INTEGER limit the number of functions to be loaded - -d, --ebedding-dimension INTEGER - embedding dimension [default: 100] - -b, --batch-size INTEGER batch size [default: 1024] - -e, --epochs INTEGER training epochs [default: 10] - -n, --neg-sample-num INTEGER negative sampling amount [default: 25] - -a, --calculate-accuracy whether calculate accuracy ( will be - significantly slower ) - - -c, --device TEXT hardware device to be used: cpu / cuda / - auto [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - --help Show this message and exit. -``` - -```bash -# Example -python train.py -i asm/ -o model.pt --epochs 100 -``` - -### test.py - -``` -Usage: test.py [OPTIONS] - -Options: - -i, --input TEXT target function [required] - -m, --model TEXT model path [required] - -e, --epochs INTEGER training epochs [default: 10] - -n, --neg-sample-num INTEGER negative sampling amount [default: 25] - -l, --limit INTEGER limit the amount of output probability result - -c, --device TEXT hardware device to be used: cpu / cuda / auto - [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - -p, --pretty pretty print table [default: False] - --help Show this message and exit. -``` - -```bash -# Example -python test.py -i asm/123456 -m model.pt -``` - -``` -┌──────────────────────────────────────────┐ -│ endbr64 │ -│ ➔ push r15 │ -│ push r14 │ -├────────┬─────────────────────────────────┤ -│ 34.68% │ [rdx + rsi*CONST + CONST] │ -│ 20.29% │ push │ -│ 16.22% │ r15 │ -│ 04.36% │ r14 │ -│ 03.55% │ r11d │ -└────────┴─────────────────────────────────┘ -``` - -### compare.py - -``` -Usage: compare.py [OPTIONS] - -Options: - -i1, --input1 TEXT target function 1 [required] - -i2, --input2 TEXT target function 2 [required] - -m, --model TEXT model path [required] - -e, --epochs INTEGER training epochs [default: 10] - -c, --device TEXT hardware device to be used: cpu / cuda / auto - [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - --help Show this message and exit. -``` - -```bash -# Example -python compare.py -i1 asm/123456 -i2 asm/654321 -m model.pt -e 30 -``` - -``` -cosine similarity : 0.873684 -``` +* Create report: ``coverage run -m unittest discover -v`` +* Read report: ``coverage report -m`` \ No newline at end of file diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 2d9cfd9..6e9d963 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -3,4 +3,7 @@ __home__ = os.path.dirname(os.path.abspath(__path__[0])) __data__ = os.path.join(__home__, "data") -__all__ = ["__data__", "__home__", "binary_to_asm", "datatype", "model", "tensors", "train", "utils", "version"] +__all__ = [ + "__data__", "__home__", "binary_to_asm", "data", "datatype", "model", "similarity", "tensors", "test", "train", + "utilities", "version" +] diff --git a/asm2vec/data.py b/asm2vec/data.py new file mode 100644 index 0000000..6713c38 --- /dev/null +++ b/asm2vec/data.py @@ -0,0 +1,43 @@ +import os +from pathlib import Path +from torch.utils.data import Dataset + +from asm2vec.datatype import Tokens, Function + + +class AsmDataset(Dataset): + # TODO - doc string - explain what this class does - how does it extend `Dataset`? 
+ def __init__(self, x, y): + self.x = x + self.y = y + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + return self.x[index], self.y[index] + + +def load_data(paths, limit=None): + # TODO - doc string + if type(paths) is not list: + paths = [paths] + + filenames = [] + for path in paths: + if os.path.isdir(path): + filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) + if os.path.isfile(Path(path) / filename)] + else: + filenames += [Path(path)] + + functions, tokens = [], Tokens() + for i, filename in enumerate(filenames): + if limit and i >= limit: + break + with open(filename) as f: + fn = Function.load(f.read()) + functions.append(fn) + tokens.add(fn.tokens()) + + return functions, tokens diff --git a/asm2vec/model.py b/asm2vec/model.py index 74a6ace..51dc433 100644 --- a/asm2vec/model.py +++ b/asm2vec/model.py @@ -1,9 +1,14 @@ import torch import torch.nn as nn +from asm2vec.datatype import Tokens + bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1) +# TODO - doc strings + + class ASM2VEC(nn.Module): def __init__(self, vocab_size, function_size, embedding_size): super(ASM2VEC, self).__init__() @@ -44,9 +49,34 @@ def forward(self, inp, pos, neg): label = torch.cat([torch.ones(batch_size, 3), torch.zeros(batch_size, neg.shape[1])], dim=1).to(device) return bce(sigmoid(pred), label) - def predict(self, inp, pos): + def predict(self, inp, pos): # Why is pos not used? Why does Predict differ so much from Forward? device, batch_size = inp.device, inp.shape[0] v = self.v(inp) probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1). to(device)), v).squeeze(dim=2) return softmax(probs) + + +def save_model(path: str, model: ASM2VEC, tokens: Tokens) -> None: + torch.save( + { + 'model_params': ( + model.embeddings.num_embeddings, + model.embeddings_f.num_embeddings, + model.embeddings.embedding_dim + ), + 'model': model.state_dict(), + 'tokens': tokens.state_dict(), + }, + path + ) + + +def load_model(path: str, device: str = 'cpu') -> tuple[ASM2VEC, Tokens]: + checkpoint = torch.load(path, map_location=device) + tokens = Tokens() + tokens.load_state_dict(checkpoint['tokens']) + model = ASM2VEC(*checkpoint['model_params']) + model.load_state_dict(checkpoint['model']) + model = model.to(device) + return model, tokens diff --git a/asm2vec/similarity.py b/asm2vec/similarity.py new file mode 100644 index 0000000..bce31b6 --- /dev/null +++ b/asm2vec/similarity.py @@ -0,0 +1,42 @@ +import torch + +from asm2vec.data import load_data +from asm2vec.model import load_model +from asm2vec.train import train + + +def cosine_similarity(v1, v2) -> float: + return (v1 @ v2 / (v1.norm() * v2.norm())).item() + + +def compare_two( + data_path_1: str, data_path_2: str, model_path: str, epochs: int = 10, device: str = "cpu", + learning_rate: float = 0.02 +) -> float: + # TODO - doc string + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + # load model, tokens + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data([data_path_1, data_path_2]) + tokens.update(tokens_new) + model.update(2, tokens.size()) + model = model.to(device) + + # train function embedding + model = train( + functions, + tokens, + model=model, + epochs=epochs, + device=device, + mode="test", + learning_rate=learning_rate + ) + + # compare 2 function vectors + v1, v2 = model.to("cpu").embeddings_f(torch.tensor([0, 1])) + similarity = cosine_similarity(v1, v2) 
+ print(f"cosine similarity : {similarity:.6f}") + return similarity diff --git a/asm2vec/test.py b/asm2vec/test.py new file mode 100644 index 0000000..c4ef7ba --- /dev/null +++ b/asm2vec/test.py @@ -0,0 +1,39 @@ +import torch + +from asm2vec.data import load_data +from asm2vec.model import load_model +from asm2vec.train import train, preprocess +from asm2vec.utilities import show_probs + + +def test_model( + data_path: str, model_path: str, epochs: int = 10, neg_sample_num: int = 25, limit: int | None = None, + device: str = "cpu", learning_rate: float = 0.02, pretty: bool = False +) -> None: + # TODO - doc string + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + # load model, tokens + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data(data_path) + tokens.update(tokens_new) + model.update(1, tokens.size()) + model = model.to(device) + + # train function embedding + model = train( + functions, + tokens, + model=model, + epochs=epochs, + neg_sample_num=neg_sample_num, + device=device, + mode="test", + learning_rate=learning_rate + ) + + # show predicted probability results + x, y = preprocess(functions, tokens) + probs = model.predict(x.to(device), y.to(device)) + show_probs(x, y, probs, tokens, limit=limit, pretty=pretty) diff --git a/asm2vec/train.py b/asm2vec/train.py index 12b8fe5..eb418d4 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -1,49 +1,12 @@ -import os import time import torch -import logging from pathlib import Path -from torch.utils.data import DataLoader, Dataset -from asm2vec.model import ASM2VEC -from asm2vec.datatype import Tokens, Function, Instruction +from torch.utils.data import DataLoader -logging.basicConfig(level=logging.INFO, format='%(message)s') - - -class AsmDataset(Dataset): - def __init__(self, x, y): - self.x = x - self.y = y - - def __len__(self): - return len(self.x) - - def __getitem__(self, index): - return self.x[index], self.y[index] - - -def load_data(paths, limit=None): - if type(paths) is not list: - paths = [paths] - - filenames = [] - for path in paths: - if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) - if os.path.isfile(Path(path) / filename)] - else: - filenames += [Path(path)] - - functions, tokens = [], Tokens() - for i, filename in enumerate(filenames): - if limit and i >= limit: - break - with open(filename) as f: - fn = Function.load(f.read()) - functions.append(fn) - tokens.add(fn.tokens()) - - return functions, tokens +from asm2vec.data import AsmDataset, load_data +from asm2vec.datatype import Function, Tokens +from asm2vec.model import ASM2VEC, load_model, save_model +from asm2vec.utilities import accuracy, callback def preprocess(functions, tokens): @@ -57,19 +20,12 @@ def preprocess(functions, tokens): def train( - functions, - tokens, - model=None, - embedding_size=100, - batch_size=1024, - epochs=10, - neg_sample_num=25, - calc_acc=False, - device='cpu', - mode='train', - callback=None, - learning_rate=0.02 + functions: list[Function], tokens: Tokens, model: ASM2VEC | None = None, embedding_size: int = 100, + batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu', + mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02 ): + # TODO: doc string + # TODO: test mode in train... this is confusing! 
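A minimal usage sketch of the `compare_two` helper added in this patch, assuming a previously trained model and two disassembled functions already on disk; every path below is a placeholder, not a value prescribed by the patch:

```python
# Illustrative sketch only: paths are placeholders; the keyword arguments
# mirror compare_two() as introduced in asm2vec/similarity.py.
from asm2vec.similarity import compare_two

score = compare_two(
    data_path_1="asm/func_a",   # placeholder path to assembly function 1
    data_path_2="asm/func_b",   # placeholder path to assembly function 2
    model_path="model.pt",      # placeholder path to a trained model
    epochs=10,
    device="cpu",
)
print(score)  # cosine similarity as a float; compare_two also prints it
```

`test_model` from the accompanying `asm2vec/test.py` follows the same pattern, but prints the top predicted tokens for a single function instead of returning a similarity score.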
if mode == 'train': if model is None: model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) @@ -100,7 +56,7 @@ def train( probs = model.predict(inp.to(device), pos.to(device)) accs.append(accuracy(pos, probs)) - if callback: + if verbose: callback({ 'model': model, 'tokens': tokens, @@ -113,98 +69,22 @@ def train( return model -def save_model(path, model, tokens): - torch.save({ - 'model_params': ( - model.embeddings.num_embeddings, - model.embeddings_f.num_embeddings, - model.embeddings.embedding_dim - ), - 'model': model.state_dict(), - 'tokens': tokens.state_dict(), - }, path) - - -def load_model(path, device='cpu'): - checkpoint = torch.load(path, map_location=device) - tokens = Tokens() - tokens.load_state_dict(checkpoint['tokens']) - model = ASM2VEC(*checkpoint['model_params']) - model.load_state_dict(checkpoint['model']) - model = model.to(device) - return model, tokens - - -def show_probs(x, y, probs, tokens, limit=None, pretty=False): - if pretty: - tl, tr, bl, br = '┌', '┐', '└', '┘' - lm, rm, tm, bm = '├', '┤', '┬', '┴' - h, v = '─', '│' - arrow = ' ➔' - else: - tl, tr, bl, br = '+', '+', '+', '+' - lm, rm, tm, bm = '+', '+', '+', '+' - h, v = '-', '|' - arrow = '->' - top = probs.topk(5) - for i, (xi, yi) in enumerate(zip(x, y)): - if limit and i >= limit: - break - xi, yi = xi.tolist(), yi.tolist() - print(tl + h * 42 + tr) - print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') - print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') - print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') - print(lm + h * 8 + tm + h * 33 + rm) - for value, index in zip(top.values[i], top.indices[i]): - if index in yi: - colorbegin, colorclear = '\033[92m', '\033[0m' - else: - colorbegin, colorclear = '', '' - print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' - f'{tokens[index.item()].name:31}{colorclear} {v}') - print(bl + h * 8 + bm + h * 33 + br) - - -def accuracy(y, probs): - return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) - - -def callback(context) -> None: - """Prettifies the display of accuracy, if chosen - """ - progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ - loss = {context["loss"]:.4f}' - - if context["accuracy"]: - progress += f', accuracy = {context["accuracy"]:.4f}' - logging.info(f"{progress}") - - def train_asm2vec_model( - train_set: str, - new_model: str, - model_path: str | None, - epochs: int, - limit: int = None, - calc_acc: bool = False, - embedding_size: int = 100, - batch_size: int = 1024, - neg_sample: int = 25, - learning_rate: float = 0.02, - device: str = 'cpu' + train_set: str, new_model: str, model_path: str | None, epochs: int, limit: int | None = None, + calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, neg_sample: int = 25, + learning_rate: float = 0.02, device: str = 'cpu' ) -> ASM2VEC: - - """Trains an asm2vec model + # TODO - this is just a wrapper - can we do this smarter? 
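A rough usage sketch of the `train_asm2vec_model` wrapper introduced here (its docstring and body continue below); the paths and hyper-parameter values are placeholders rather than values prescribed by this patch:

```python
# Illustrative sketch only: folder and file names are placeholders.
from asm2vec.train import train_asm2vec_model

model = train_asm2vec_model(
    train_set="asm/",      # folder of disassembled functions (placeholder)
    new_model="model.pt",  # where the trained model is saved
    model_path=None,       # no previously trained model to continue from
    epochs=100,
    limit=100,             # only load the first 100 functions
    calc_acc=True,         # display per-epoch accuracy (slower)
    device="auto",         # per the docstring: 'auto' | 'cuda' | 'cpu'
)
```

Per its docstring, passing `model_path` continues training from an already trained model instead of starting fresh.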
+ """Trains an ASM2VEC model :param train_set: path to the training dataset :param new_model: path to the model to be trained :param model_path: path to already trained model - :param limit: number of the assembly functions that the model will be trained on; - if not defined, all the assembly functions in train_set_path + :param limit: number of the assembly functions that the model will be trained on; if not defined, all the assembly + functions in train_set_path :param epochs: number of epochs :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training - :param embedding_size: size of the vector representation for a token; an assembly function - will be represented with a vector twice that size + :param embedding_size: size of the vector representation for a token; an assembly function will be represented + with a vector twice that size :param batch_size: the size of batches for training :param neg_sample: negative sampling amount :param device: 'auto' | 'cuda' | 'cpu' @@ -233,7 +113,7 @@ def train_asm2vec_model( neg_sample_num=neg_sample, calc_acc=calc_acc, device=device, - callback=callback, + verbose=True, learning_rate=learning_rate ) save_model(new_model, model, tokens) diff --git a/asm2vec/utilities.py b/asm2vec/utilities.py new file mode 100644 index 0000000..dd39aac --- /dev/null +++ b/asm2vec/utilities.py @@ -0,0 +1,55 @@ +import logging +import torch + +from asm2vec.datatype import Instruction + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +# TODO - Why do we have both logging and print? +# TODO - Doc strings + +def accuracy(y, probs): + return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) + + +def callback(context) -> None: + """Prettifies the display of accuracy, if chosen + """ + progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ + loss = {context["loss"]:.4f}' + + if context["accuracy"]: + progress += f', accuracy = {context["accuracy"]:.4f}' + logging.info(f"{progress}") + + +def show_probs(x, y, probs, tokens, limit=None, pretty=False): + if pretty: + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' + arrow = ' ➔' + else: + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' + arrow = '->' + top = probs.topk(5) + for i, (xi, yi) in enumerate(zip(x, y)): + if limit and i >= limit: + break + xi, yi = xi.tolist(), yi.tolist() + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) + for value, index in zip(top.values[i], top.indices[i]): + if index in yi: + colorbegin, colorclear = '\033[92m', '\033[0m' + else: + colorbegin, colorclear = '', '' + print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) diff --git a/requirements.txt b/requirements.txt index d92495b..3163633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ torch>=1.7,<2 -click>=7.1,<8 r2pipe>=1.5,<2 diff --git a/scripts/compare.py b/scripts/compare.py deleted file mode 100644 index 3860b83..0000000 --- a/scripts/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -import click -import asm2vec - -def cosine_similarity(v1, v2): - 
return (v1 @ v2 / (v1.norm() * v2.norm())).item() - -@click.command() -@click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True) -@click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True) -@click.option('-m', '--model', 'mpath', help='model path', required=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -def cli(ipath1, ipath2, mpath, epochs, device, lr): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # load model, tokens - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data([ipath1, ipath2]) - tokens.update(tokens_new) - model.update(2, tokens.size()) - model = model.to(device) - - # train function embedding - model = asm2vec.utils.train( - functions, - tokens, - model=model, - epochs=epochs, - device=device, - mode='test', - learning_rate=lr - ) - - # compare 2 function vectors - v1, v2 = model.to('cpu').embeddings_f(torch.tensor([0, 1])) - - print(f'cosine similarity : {cosine_similarity(v1, v2):.6f}') - -if __name__ == '__main__': - cli() diff --git a/scripts/test.py b/scripts/test.py deleted file mode 100644 index 31372aa..0000000 --- a/scripts/test.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -import click -import asm2vec - -@click.command() -@click.option('-i', '--input', 'ipath', help='target function', required=True) -@click.option('-m', '--model', 'mpath', help='model path', required=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True) -@click.option('-l', '--limit', help='limit the amount of output probability result', type=int) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -@click.option('-p', '--pretty', default=False, help='pretty print table', show_default=True, is_flag=True) -def cli(ipath, mpath, epochs, neg_sample_num, limit, device, lr, pretty): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # load model, tokens - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data(ipath) - tokens.update(tokens_new) - model.update(1, tokens.size()) - model = model.to(device) - - # train function embedding - model = asm2vec.utils.train( - functions, - tokens, - model=model, - epochs=epochs, - neg_sample_num=neg_sample_num, - device=device, - mode='test', - learning_rate=lr - ) - - # show predicted probability results - x, y = asm2vec.utils.preprocess(functions, tokens) - probs = model.predict(x.to(device), y.to(device)) - asm2vec.utils.show_probs(x, y, probs, tokens, limit=limit, pretty=pretty) - -if __name__ == '__main__': - cli() From 8f572c35e39b980bc544e258abcffebe87fcb0e9 Mon Sep 17 00:00:00 2001 From: Jamie Nutter <64031696+jamienutter@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:51:52 +0100 Subject: [PATCH 51/59] TRIVIAL - doc strings --- asm2vec/binary_to_asm.py | 61 
++++++++++++++++++++++------------------ asm2vec/datatype.py | 2 ++ asm2vec/tensors.py | 30 ++++++++++---------- 3 files changed, 50 insertions(+), 43 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 3c141d9..58ccaa1 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -9,16 +9,19 @@ def _sha3(asm: str) -> str: - """Produces SHA3 for each assembly function - :param asm: input assembly function + """ + Produces SHA3 for each assembly function + :param asm: Input assembly function + :return: Hashed string """ return hashlib.sha3_256(asm.encode()).hexdigest() def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: - """Extracts magic bytes and returns the header - :param filename: name of the malware file (SHA1) - :param magic_bytes for the specific OS/type of binary + """ + Extracts magic bytes and returns the header + :param filename: Name of the malware file (SHA1) + :param magic_bytes: For the specific OS/type of binary :return: Boolean of the header existing in magic bytes """ magics = [bytes.fromhex(i) for i in magic_bytes] @@ -28,8 +31,10 @@ def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: def _normalize(opcode: str) -> str: - """ Normalizes the input string - :param opcode: opcode of the binary + """ + Normalizes the input opcode string + :param opcode: Opcode of the binary + :return Normalized opcode string """ opcode = opcode.replace(' - ', ' + ') opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) @@ -39,9 +44,11 @@ def _normalize(opcode: str) -> str: def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: - """Converts functions to assembly code + """ + Converts functions to assembly code :param pdf: disassembly :param asm_minlen: minimum length of assembly functions to be extracted + :return: ASM string """ if pdf is None: return '' @@ -71,7 +78,8 @@ def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int: - """Fragments the input binary into assembly functions via r2pipe + """ + Fragments the input binary into assembly functions via r2pipe :param filename: name of the malware file (SHA1) :param output_path: path to the folder to store the assembly functions for each malware :param asm_minlen: the minimum length of assembly functions to be extracted @@ -102,25 +110,22 @@ def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: return count -def convert_to_asm(input_path: str, - output_path: str, - minlen_upper: int, - minlen_lower: int, - magic_bytes: list[str] = None - ) -> list: - """ Extracts assembly functions from malware files and saves them - into separate folder per binary - :param input_path: the path to the malware binaries - :param output_path: the path for the assembly functions to be extracted - :param minlen_upper: The minimum number of assembly functions needed for disassembling - :param minlen_lower: If disassembling not possible with with minlen_upper, lower the minimum number - of assembly functions to minlen_lower - :param magic_bytes: list of valid for the specific OS/type of binary; e.g. 
- 'cffaedfe' for Mach-O Little Endian (64-bit) - 'feedfacf' for Mach-O Big Endian (64-bit) - 'cefaedfe' for Mach-O Little Endian (32-bit) - 'feedface': Mach-O Big Endian (32-bit) - 'cafebabe' Universal Binary Big Endian +def convert_to_asm( + input_path: str, output_path: str, minlen_upper: int, minlen_lower: int, magic_bytes: list[str] = None +) -> list: + """ + Extracts assembly functions from malware files and saves them into separate folder per binary + :param input_path: Path to the malware binaries + :param output_path: Path for the assembly functions to be extracted + :param minlen_upper: Minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling is not possible with minlen_upper, lower the minimum number of assembly + functions to minlen_lower (WHAT?) + :param magic_bytes: List of valid for the specific OS/type of binary, e.g. + - 'cffaedfe' for Mach-O Little Endian (64-bit) + - 'feedfacf' for Mach-O Big Endian (64-bit) + - 'cefaedfe' for Mach-O Little Endian (32-bit) + - 'feedface': Mach-O Big Endian (32-bit) + - 'cafebabe' Universal Binary Big Endian :return: List of sha1 of disassembled malware files """ if not magic_bytes: diff --git a/asm2vec/datatype.py b/asm2vec/datatype.py index b6451d8..f618800 100644 --- a/asm2vec/datatype.py +++ b/asm2vec/datatype.py @@ -2,6 +2,8 @@ import random import warnings +# TODO - doc strings + class Token: def __init__(self, name, index): diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index 01b306f..fde5296 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -1,26 +1,26 @@ import os import torch import logging -from asm2vec.train import train, load_model, load_data from pathlib import Path +from asm2vec.train import train, load_model, load_data + logging.basicConfig(level=logging.INFO, format='%(message)s') -def calc_tensors(asm_path: str, - tensor_path: str, - model_path: str, - epochs: int, - device: str = 'cpu', - learning_rate: float = 0.02) -> list: - """Calculates vector representation of a binary as the mean per column - of the vector representations of its assembly functions - :param asm_path: folder with assembly function in a subfolder per binary - :param tensor_path: folder to store the tensors - :param model_path: path to the trained model - :param epochs: number of epochs - :param device: 'auto' | 'cuda' | 'cpu' - :param learning_rate: learning rate +def calc_tensors( + asm_path: str, tensor_path: str, model_path: str, epochs: int, device: str = 'cpu', learning_rate: float = 0.02 +) -> list: + """ + Calculates vector representation of a binary as the mean per column of the vector representations of its assembly + functions. 
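A short usage sketch of the `convert_to_asm` helper documented above; the folder names are placeholders and the magic-bytes value mirrors the Mach-O example used in the unit tests earlier in this series:

```python
# Illustrative sketch only: requires radare2/r2pipe; folder names are placeholders.
from asm2vec.binary_to_asm import convert_to_asm

sha1_list = convert_to_asm(
    input_path="malware_bin/",   # folder of input binaries (placeholder)
    output_path="malware_asm/",  # one sub-folder of assembly functions per binary
    minlen_upper=10,             # stricter extraction threshold tried first
    minlen_lower=5,              # relaxed fallback threshold
    magic_bytes=["cffaedfe"],    # Mach-O Little Endian (64-bit)
)
print(sha1_list)  # SHA1 names of the binaries that were disassembled
```

The per-binary output folders written here are what `train_asm2vec_model` and `calc_tensors` expect as input.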
+ :param asm_path: Path to folder with assembly function in a sub-folder per binary + :param tensor_path: Path to folder to store the tensors + :param model_path: Path to the trained model + :param epochs: Number of epochs + :param device: 'auto' | 'cuda' | 'cpu' + :param learning_rate: Learning rate + :return: List of tensors """ tensors_list = [] if device == 'auto': From 1cf86db35d3645781f2408cab3653621fdd02869 Mon Sep 17 00:00:00 2001 From: "CI2.0" Date: Thu, 5 Oct 2023 15:03:00 +0000 Subject: [PATCH 52/59] [Jenkins] Set version to 1.0.3 --- asm2vec/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/version.py b/asm2vec/version.py index 500db07..c85dc7e 100644 --- a/asm2vec/version.py +++ b/asm2vec/version.py @@ -1,4 +1,4 @@ -VERSION = '1.0.2' +VERSION = '1.0.3' DEV_VERSION = '0' radare2_version = "5.8.8" From 9d794e25c4c2604a617d3d099000ecc38c6eee6f Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:15:38 +0200 Subject: [PATCH 53/59] AEGIS-6406 rename "test" mode to "update" --- asm2vec/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/test.py b/asm2vec/test.py index c4ef7ba..b80cc14 100644 --- a/asm2vec/test.py +++ b/asm2vec/test.py @@ -29,7 +29,7 @@ def test_model( epochs=epochs, neg_sample_num=neg_sample_num, device=device, - mode="test", + mode="update", learning_rate=learning_rate ) From 2fec9d1280cd89880676594398ac016ae12d26d1 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:19:49 +0200 Subject: [PATCH 54/59] AEGIS-6406 add docstring, change "test" mode to "update" mode --- asm2vec/train.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/asm2vec/train.py b/asm2vec/train.py index eb418d4..4de7a81 100644 --- a/asm2vec/train.py +++ b/asm2vec/train.py @@ -2,7 +2,6 @@ import torch from pathlib import Path from torch.utils.data import DataLoader - from asm2vec.data import AsmDataset, load_data from asm2vec.datatype import Function, Tokens from asm2vec.model import ASM2VEC, load_model, save_model @@ -24,15 +23,28 @@ def train( batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu', mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02 ): - # TODO: doc string - # TODO: test mode in train... this is confusing! 
+ """This function trains a model on the given assembly functions and tokens + :param functions: list of assembly functions + :param tokens: tokens (operations, operands) of the assembly function + :param model: type of the model; ; (Optional, default ASM2VEC) + :param embedding_size: size of the tensor representation of an assembly function; (Optional, default value = 100) + :param batch_size: size of the batch for each epoch of training; (Optional, default value = 1024) + :param epochs: number of epochs for training the model; (Optional, default value = 10) + :param neg_sample_num: size of the negative sample; (Optional, default value = 25) + :param calc_acc: if set to True, the accuracy per training epoch is displayed; (Optional, default False) + :param device: the device used for processing; (Optional, default 'cpu') + :param mode: 'train' (to train a new model) | 'update' (to add to an already trained model's dictionary); + (Optional, default 'train') + :param verbose: if True performs training in verbose mode; (Optional, default False) + :param learning_rate: learning rate + """ if mode == 'train': if model is None: model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) - elif mode == 'test': + elif mode == 'update': if model is None: - raise ValueError("test mode required pretrained model") + raise ValueError("Update mode requires a pretrained model") optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) else: raise ValueError("Unknown mode") @@ -89,6 +101,7 @@ def train_asm2vec_model( :param neg_sample: negative sampling amount :param device: 'auto' | 'cuda' | 'cpu' :param learning_rate: learning rate + :return an ASM2VEC model """ if device == 'auto': From 3c9833c921d9fc4812f94a232ef34043b225b592 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 12:29:44 +0200 Subject: [PATCH 55/59] AEGIS-6406 add docstring, set mode to "update" --- asm2vec/similarity.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/asm2vec/similarity.py b/asm2vec/similarity.py index bce31b6..ea52327 100644 --- a/asm2vec/similarity.py +++ b/asm2vec/similarity.py @@ -13,30 +13,36 @@ def compare_two( data_path_1: str, data_path_2: str, model_path: str, epochs: int = 10, device: str = "cpu", learning_rate: float = 0.02 ) -> float: - # TODO - doc string + """This function produces the cosine similarity of a pair of assembly functions + :param data_path_1: the path to the assembly function no. 1 + :param data_path_2: the path to the assembly function no. 
2 + :param model_path: the path to the trained asm2vec model + :param epochs: the number of epochs for calculating the tensor representations; (Optional, default = 10) + :param device: 'auto' | 'cuda' | 'cpu' (Optional, default 'cpu') + :param learning_rate: learning rate; (Optional; default = 0.02) + :return the cosine similarity value + """ if device == "auto": device = "cuda" if torch.cuda.is_available() else "cpu" - # load model, tokens model, tokens = load_model(model_path, device=device) functions, tokens_new = load_data([data_path_1, data_path_2]) tokens.update(tokens_new) model.update(2, tokens.size()) model = model.to(device) - - # train function embedding + model = train( functions, tokens, model=model, epochs=epochs, device=device, - mode="test", + mode="update", learning_rate=learning_rate ) - # compare 2 function vectors v1, v2 = model.to("cpu").embeddings_f(torch.tensor([0, 1])) similarity = cosine_similarity(v1, v2) - print(f"cosine similarity : {similarity:.6f}") + print(f"Cosine similarity : {similarity:.6f}") + return similarity From 2a8433a97a8b3840022eb02a347d17aaa6f918f1 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:39:23 +0200 Subject: [PATCH 56/59] AEGIS-6406 change mode from "test" to "update" --- asm2vec/tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py index fde5296..78a356e 100644 --- a/asm2vec/tensors.py +++ b/asm2vec/tensors.py @@ -55,7 +55,7 @@ def calc_tensors( model=model, epochs=epochs, device=device, - mode='test', + mode='update', learning_rate=learning_rate ) From 45e10f744047cdfc8eac40a7d7e808a03be6c4c3 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Wed, 25 Oct 2023 12:04:36 +0200 Subject: [PATCH 57/59] AEGIS-6406 add identation Disassemble only if the folder does not exist --- asm2vec/binary_to_asm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 58ccaa1..70218d7 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -145,18 +145,18 @@ def convert_to_asm( out_dir = os.path.join(asm_dir, entry.name) if not (os.path.exists(out_dir)): os.mkdir(out_dir) - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) - if function_count == 0: - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) if function_count == 0: - os.rmdir(out_dir) - logging.info('The binary {} was not disassembled'.format(entry.name)) + function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + if function_count == 0: + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) else: binary_count += 1 disassembled_bins.append(entry.name) - else: - binary_count += 1 - disassembled_bins.append(entry.name) else: not_found += 1 logging.info("[Error] No such file or directory: {}".format(binary_dir)) From 8d1b419e928890b2042b9772dd76cbb9a409be13 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Wed, 25 Oct 2023 13:09:22 +0200 Subject: [PATCH 58/59] AEGIS-6406 fix function_count Correctly calculate function_count per binary, not cumulatively --- asm2vec/binary_to_asm.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 70218d7..28b573f 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -145,9 +145,9 @@ def convert_to_asm( out_dir = os.path.join(asm_dir, entry.name) if not (os.path.exists(out_dir)): os.mkdir(out_dir) - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) + function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) if function_count == 0: - function_count += bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) if function_count == 0: os.rmdir(out_dir) logging.info('The binary {} was not disassembled'.format(entry.name)) From 90c9f991ad015f1ef28df92fd50b5e9041157c19 Mon Sep 17 00:00:00 2001 From: ilektragiassa <117294049+ilektragiassa@users.noreply.github.com> Date: Wed, 25 Oct 2023 18:05:44 +0200 Subject: [PATCH 59/59] AEGIS-6406 add magic bytes --- asm2vec/binary_to_asm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py index 28b573f..1da1389 100644 --- a/asm2vec/binary_to_asm.py +++ b/asm2vec/binary_to_asm.py @@ -121,15 +121,16 @@ def convert_to_asm( :param minlen_lower: If disassembling is not possible with minlen_upper, lower the minimum number of assembly functions to minlen_lower (WHAT?) :param magic_bytes: List of valid for the specific OS/type of binary, e.g. - - 'cffaedfe' for Mach-O Little Endian (64-bit) - - 'feedfacf' for Mach-O Big Endian (64-bit) - - 'cefaedfe' for Mach-O Little Endian (32-bit) + - 'cffaedfe': for Mach-O Little Endian (64-bit) + - 'feedfacf': for Mach-O Big Endian (64-bit) + - 'cefaedfe': for Mach-O Little Endian (32-bit) - 'feedface': Mach-O Big Endian (32-bit) - - 'cafebabe' Universal Binary Big Endian + - 'cafebabe': Universal Binary Big Endian + - 'bebafeca' :return: List of sha1 of disassembled malware files """ if not magic_bytes: - magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface'] + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface', 'bebafeca'] binary_dir = Path(input_path) asm_dir = Path(output_path)
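
---

For orientation, below is a minimal usage sketch of the public API touched by this patch series: convert_to_asm from asm2vec/binary_to_asm.py and compare_two from asm2vec/similarity.py. The module paths, function names and keyword arguments are taken from the diffs above; the concrete file paths, threshold values and model filename are illustrative assumptions only, not part of any commit.

    # Sketch only, based on the signatures shown in the diffs above; every
    # concrete path and numeric threshold here is a placeholder assumption.
    from asm2vec.binary_to_asm import convert_to_asm
    from asm2vec.similarity import compare_two

    # Disassemble each binary into its own folder of assembly functions,
    # retrying with the lower function-count threshold if the first pass
    # produces nothing (the behaviour fixed in the last two patches).
    disassembled = convert_to_asm(
        input_path="binaries/",     # hypothetical folder of input binaries
        output_path="asm/",         # hypothetical output folder per binary
        minlen_upper=10,
        minlen_lower=5,
    )
    print(f"Disassembled binaries: {disassembled}")

    # Compare two assembly functions using a pre-trained model. Internally
    # this calls train(..., mode="update"), the mode renamed from "test"
    # earlier in this series, and returns the cosine similarity.
    similarity = compare_two(
        data_path_1="asm/sample_a/function_0",   # hypothetical function files
        data_path_2="asm/sample_b/function_0",
        model_path="asm2vec_model.pt",           # assumed pre-trained model
        epochs=10,
        device="cpu",
    )
    print(f"Returned similarity: {similarity:.6f}")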