ideasubmission_BhuMeteam #2

Open · wants to merge 12 commits into main
13 changes: 13 additions & 0 deletions 712_finetuning/712data.jsonl

Large diffs are not rendered by default.
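The raw training file is not rendered in the diff, but data_cleaning.py below reads it one conversation per line and indexes messages[0..2], which implies a structure roughly like the following. This is a purely hypothetical record with invented values:

```python
# Hypothetical 712data.jsonl record, inferred from how data_cleaning.py
# indexes the messages list; all field values here are invented placeholders.
example_record = {
    "messages": [
        {"role": "system", "content": "Extract the values of keys in this JSON format: ..."},
        {"role": "user", "content": "<table><tr><td>12</td><td>Ramesh</td>...</tr></table>"},
        {"role": "assistant", "content": '[{"Account Number": "12", "Occupier Name": "Ramesh"}]'},
    ]
}
```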

40 changes: 40 additions & 0 deletions 712_finetuning/data_cleaning.py
import os

import pandas as pd

basePath = os.path.dirname(os.path.abspath(__file__))
# 712data.jsonl is line-delimited JSON, so lines=True is required
df = pd.read_json(basePath + '/712data.jsonl', lines=True)

def remove_newline_chars(message):
    # Replace newlines inside a chat message's 'content' field with spaces
    if 'content' in message:
        message['content'] = message['content'].replace('\n', ' ')
    return message

# Apply the function to the 'content' key in each message dictionary
df['messages'] = df['messages'].apply(lambda msgs: [remove_newline_chars(m) for m in msgs])

# Flatten each three-message conversation (assumed system, user, assistant)
# into instruction / input / output columns
rows = []
for messages in df['messages']:
    rows.append({
        'instruction': messages[0]['content'],
        'input': messages[1]['content'],
        'output': messages[2]['content'],
    })
df = pd.DataFrame(rows)

def convert_to_conversation(df):
    # Append the fixed extraction system prompt to every instruction.
    # Literal braces in the expected output format are doubled ({{ }})
    # so the f-string does not treat them as placeholders.
    data = []
    for _, row in df.iterrows():
        question = row['instruction']
        prompt = f'''{question} You are a data extractor for occupier details of a property in India. Using the provided HTML data, extract information to create a JSON array of objects. If you do not find a match, leave the value empty. Translate all numbers in English. Do NOT send the wrong answer. The HTML data is structured in a table format, with each row representing an occupier's details. The fields are 'Account Number', 'Occupier Name', 'Area', 'Land Revenue', 'Barren', and 'Number of mutation', each located within specific table cells. Your task is to parse this HTML and convert it into a JSON array, where each object corresponds to a row in the HTML table. The JSON object should have key-value pairs matching the field names to the extracted data. The final output should strictly follow the format [{{}},{{}},...], with each object containing the keys: 'Account Number', 'Occupier Name', 'Area', 'Land Revenue', 'Barren', and 'Number of mutation', each corresponding to their respective data in the HTML.'''
        data.append(prompt)
    return data

s = convert_to_conversation(df)
df = pd.DataFrame(s, columns=['Instruct'])

df.to_csv("712_training_instruct.csv")
18 changes: 18 additions & 0 deletions 712_finetuning/inference.py
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

base_model = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)  # was missing but is used by the pipeline below

repo_id = ''  # adapter repo id

# Load the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, repo_id)

prompt = ""
system_prompt = ""
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1000)
result = pipe(f"Question:{prompt}[INST]{system_prompt}[/INST] Assistant:")
print(result[0]['generated_text'])
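For deployment, the adapter could optionally be folded into the base weights. A minimal sketch using PEFT's merge_and_unload API; the output directory name is a hypothetical placeholder:

```python
# Optional: merge the LoRA adapter into the base model for standalone serving.
# merge_and_unload() is part of peft's PeftModel API.
merged = model.merge_and_unload()
merged.save_pretrained("mistral-712-merged")     # hypothetical output directory
tokenizer.save_pretrained("mistral-712-merged")  # keep tokenizer alongside the weights
```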
73 changes: 73 additions & 0 deletions 712_finetuning/mistral_calling.py
import warnings
from threading import Thread
from typing import List

from huggingface_hub import InferenceClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer
)


class Mistral:
    # Assumed input cap; this constant was referenced but never defined
    MAX_INPUT_TOKEN_LENGTH = 4096

    def __init__(self):
        # Hosted endpoint used for remote zero-shot extraction
        self.client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
        self.model = None
        self.tokenizer = None

    def format_prompt(self, message):
        # Wrap the message in Mistral's instruction tags
        return f"<s>[INST] {message} [/INST]"

    def inference(self, prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
        # Remote streaming generation via the Hugging Face Inference API
        temperature = max(float(temperature), 1e-2)
        top_p = float(top_p)
        generate_kwargs = dict(
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            seed=42)
        system_prompt = """You are a data extractor for occupier details of a property in India. Using the provided HTML data, extract information to create a JSON array of objects. If you do not find a match, leave the value empty. Translate all numbers in English. Do NOT send the wrong answer. The HTML data is structured in a table format, with each row representing an occupier's details. The fields are 'Account Number', 'Occupier Name', 'Area', 'Land Revenue', 'Barren', and 'Number of mutation', each located within specific table cells. Your task is to parse this HTML and convert it into a JSON array, where each object corresponds to a row in the HTML table. The JSON object should have key-value pairs matching the field names to the extracted data. The final output should strictly follow the format [{},{},...], with each object containing the keys: 'Account Number', 'Occupier Name', 'Area', 'Land Revenue', 'Barren', and 'Number of mutation', each corresponding to their respective data in the HTML."""
        prompt = f"{system_prompt},{prompt}"
        formatted_prompt = self.format_prompt(prompt)
        stream = self.client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
        output = ""
        for response in stream:
            output += response.token.text
        return output

    def model_tokenizer(self):
        # Local 7B model and tokenizer used by generate() below
        self.model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
        self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

    def generate(self, message: str, max_new_tokens: int = 1024, temperature: float = 0.6,
                 top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2) -> List[str]:
        # Local streaming generation; collects the streamed chunks into a list
        self.model_tokenizer()
        conversation = [{"role": "user", "content": message}]
        input_ids = self.tokenizer.apply_chat_template(conversation, return_tensors="pt")
        if input_ids.shape[1] > self.MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -self.MAX_INPUT_TOKEN_LENGTH:]
            # was gr.Warning, but gradio is not imported in this module
            warnings.warn(f"Trimmed input from conversation as it was longer than {self.MAX_INPUT_TOKEN_LENGTH} tokens.")
        input_ids = input_ids.to(self.model.device)
        streamer = TextIteratorStreamer(self.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            num_beams=1,
            repetition_penalty=repetition_penalty,
        )
        # Run generation in a background thread while the streamer yields text
        t = Thread(target=self.model.generate, kwargs=generate_kwargs)
        t.start()
        outputs = []
        for text in streamer:
            outputs.append(text)
        return outputs
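A minimal usage sketch for this class, assuming Hugging Face credentials are already configured for the InferenceClient; the HTML snippet is an invented placeholder:

```python
# Usage sketch only; the HTML below is a made-up stand-in for a real
# land-record table scraped at runtime.
if __name__ == "__main__":
    mistral = Mistral()
    html_snippet = "<table><tr><td>12</td><td>Ramesh</td><td>2.5</td></tr></table>"
    print(mistral.inference(html_snippet))
```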

125 changes: 125 additions & 0 deletions 712_finetuning/mistral_finetune_cuda.py
import pandas as pd
from datasets import Dataset
from huggingface_hub import login
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer
import pyarrow as pa

hf_token = ''  # Hugging Face access token

login(hf_token)

df = pd.read_csv('712_training_instruct.csv')
df = df.drop('Unnamed: 0', axis=1)

df = df.dropna(subset=['Instruct'])  # `!= np.nan` never filters, since NaN != NaN is always True
dataset = Dataset(pa.Table.from_pandas(df))

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
trust_remote_code=True
)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
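# For a two-turn conversation [user: "Hello", assistant: "Hi"], this template
# renders: <s>[INST] Hello [/INST] Hi</s> (plus a trailing space)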
tokenizer.pad_token = tokenizer.unk_token
tokenizer.clean_up_tokenization_spaces = True
tokenizer.add_bos_token = False
tokenizer.padding_side = "right"

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
r=lora_r,
bias="none",
task_type="CAUSAL_LM",
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
],
)


output_dir = "./results"
num_train_epochs = 2
auto_find_batch_size = True
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy = "epoch"
learning_rate = 3e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.03
logging_strategy = "steps"
logging_steps = 25
evaluation_strategy = "no"
bf16 = False  # T4 GPUs used for this CUDA benchmark do not support bfloat16

training_arguments = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_train_epochs,
auto_find_batch_size=auto_find_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
optim=optim,
save_strategy=save_strategy,
learning_rate=learning_rate,
lr_scheduler_type=lr_scheduler_type,
warmup_ratio=warmup_ratio,
logging_strategy=logging_strategy,
logging_steps=logging_steps,
evaluation_strategy=evaluation_strategy,
    bf16=bf16,
)

max_seq_length = 512

response_template = "[/INST]"
print(f"Response template for collator: {response_template}")


trainer = SFTTrainer(
model=model,
train_dataset=dataset,
dataset_text_field="Instruct",
#data_collator=collator,
peft_config=peft_config,
max_seq_length=max_seq_length,
tokenizer=tokenizer,
args=training_arguments,
)

# Pre-process the model by upcasting the layer norms to float32 for more stable training
for name, module in trainer.model.named_modules():
if "norm" in name:
module = module.to(torch.float32)


trainer.train()
repo_id = ''  # target Hub repo for the trained adapter
trainer.model.push_to_hub(repo_id)
90 changes: 90 additions & 0 deletions README copy.md
# oneAPI-GenAI-Hackathon-2023 - Hack2Skill

#### Team Name - Team BhuMe
#### Problem Statement - AI-Enhanced Legal Practice Platform
#### Team Leader Email - [email protected]

### 📜 Overview
This project is part of the Intel oneAPI Hackathon 2023, under the Generative AI Large Language Models Fine-Tuned for Legal Practice Platform theme, by Team BhuMe.
We developed a robust LLM, fine-tuned on Indian property registry and documentation data, that extracts structured information from unstructured and complex property records. The goal is to simplify and speed up the legal due diligence on a property, which has to be completed in the short window of deal-making. We built on Mistral-7B, an open-source large language model. Data is downloaded at runtime from publicly available government digital land records. Our work focused on an investor- and broker-centric product that automatically fetches ownership data for a given village and plot number and presents it in a simple-to-understand UI. Selenium automates the data download from public records, and the fine-tuned Mistral model extracts structured information that is displayed on a vector-map UI. Mistral is fine-tuned on the downstream task through an adapter layer to improve accuracy and make it less verbose.

Further, training was run both with Intel's IPEX (Intel Extension for PyTorch) and on an Nvidia T4 GPU for benchmarking purposes. Real-time inference is achieved with intel_extension_for_pytorch, and the inference model is hosted on Intel Developer Cloud behind an ngrok tunnel.
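As a hedged sketch of that inference path, not the exact deployment code, IPEX can be applied to the model from inference.py roughly like this:

```python
# Sketch only: optimizing the inference model with Intel Extension for PyTorch.
# ipex.optimize() is the library's standard CPU-optimization entry point.
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2", torch_dtype=torch.bfloat16
)
model.eval()
model = ipex.optimize(model, dtype=torch.bfloat16)  # fused kernels, AMX where available
```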


### 📜 A Brief of the Prototype:
The app is available at https://app.bhume.in/

Brokers and real estate investors use our services to automate and simplify property ownership documentation.

Lawyers use this tool to automatically download property registry data from government websites, and then filter the property of interest based on the property schedule, which contains khasra no., survey no., plot no., and other fields.

Valuers use this tool to extract sale instances of properties near their area of interest.


### Tech Stack:

Technologies used to build the prototype: Intel® AI Analytics Toolkit and its libraries.
![tech_stack](tech_stack.png)
We use a mix of React, Python, Django, and Postgres, along with libraries such as Selenium and scikit-learn and fine-tuned LLMs from OpenAI, to build and run the app. Data is scraped and stored for each request at runtime. Downloaded data is filtered by document type and then fed into the LLMs one by one to extract the information that lets us simplify the UI.
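A hedged sketch of the Selenium step described above; the URL and element locators below are invented placeholders, not the actual government portal:

```python
# Illustration only: automating a land-records lookup with Selenium.
# The URL and element IDs are placeholders for the real portal's fields.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example-land-records.gov.in")                 # placeholder URL
driver.find_element(By.ID, "village").send_keys("Some Village")   # placeholder locator
driver.find_element(By.ID, "search").click()                      # placeholder locator
html = driver.page_source  # raw HTML later passed to the fine-tuned model
driver.quit()
```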

### Step-by-Step Code Execution Instructions:
The steps below clone and run the prototype so that it can be tested and analyzed in depth.

Run the frontend:
```bash
cd frontend
npm install
npm start
```

Run the inference model:
```bash
python mistral_calling.py
```

Go to http://localhost:3000


### Step-by-Step Finetuning
Use the following commands to setup and activate the conda environment
```bash
conda create -n venv python==3.8.10
conda activate venv
pip install -r requirements.txt
```

set the env variable to select Intel AMX ISA
```bash
export ONEDNN_MAX_CPU_ISA="AVX512_CORE_AMX"
```

Preprocessing: prepare the dataset using preprocess.py
```bash
python preprocess.py
```

Finetuning: run
```bash
python falcon-tune.py --bf16 True --use_ipex True --max_seq_length 512
```

Inference:
```bash
python falcon-tuned-inference.py --checkpoints <PATH-TO-CHECKPOINT> --max_length 200 --top_k 10
```

### 🚩 Benchmarking Results:
![benchmark](benchmark.png)
*inference time is measured in seconds

### Future Scope:

Property disputes account for 70% of all civil court cases in India. Proper due diligence before any transaction can help a person avoid legal issues. The current barrier to due diligence is cleaning, processing, and analyzing the data for each property within the short window in which a deal is negotiated.

Future scope of work is:
1. to integrate other legal documents pertaining to property ownership (depth)
2. to expand to other states
3. download ownership data for each district one-by-one and make it available to users
