Lahiru Menikdiwela committed on
Commit 173b5f1 · 1 Parent(s): ab21bba

changes done according to llama model

Files changed (2):
  1. model.py +15 -13
  2. summarizer.py +32 -4
model.py CHANGED
@@ -19,28 +19,30 @@ def get_local_model(model_name_or_path:str)->pipeline:

     #print(f"Model is running on {device}")

-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name_or_path,
-        token = hf_token
-    )
+    #!!!!!Removed for Llama model
+    # tokenizer = AutoTokenizer.from_pretrained(
+    #     model_name_or_path,
+    #     token = hf_token
+    # )
     model = AutoModelForCausalLM.from_pretrained(
         model_name_or_path,
         torch_dtype=torch.bfloat16,
         # load_in_4bit = True,
         token = hf_token
     )
-    pipe = pipeline(
-        task = "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        device = device,
-        max_new_tokens = 400,
-        model_kwargs = {"max_length":16384, "max_new_tokens": 512},
-    )
+    #!!!!!!!!!!!!!!!!!!!!!Removed for Llama model!!!!!!!!!!!!!!!!!!!!!!!
+    # pipe = pipeline(
+    #     task = "summarization",
+    #     model=model,
+    #     tokenizer=tokenizer,
+    #     device = device,
+    #     max_new_tokens = 400,
+    #     model_kwargs = {"max_length":16384, "max_new_tokens": 512},
+    # )

     logger.info(f"Summarization pipeline created and loaded to {device}")

-    return pipe
+    return model

 def get_endpoint(api_key:str):
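With the pipeline() wrapper and the in-function tokenizer load removed, get_local_model now returns the bare AutoModelForCausalLM, which also makes the ->pipeline hint in its signature stale. A minimal sketch of the new contract, assuming HF_TOKEN is a valid Hugging Face access token and using an illustrative model id (the diff never names the checkpoint):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"  # illustrative; any chat-tuned Llama
HF_TOKEN = "hf_..."                         # assumption: a valid access token

# What get_local_model now hands back: the raw causal LM. Without the
# pipeline() wrapper, generation settings (max_new_tokens etc.) move to
# the generate() call site in summarizer.py.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,  # bf16 weights, as in the diff
    token=HF_TOKEN,
)

# The tokenizer load deleted here has to happen elsewhere; summarizer.py
# still expects a tokenizer from summarizer_init.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)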
 
summarizer.py CHANGED
@@ -18,9 +18,13 @@ def summarizer_init(model_name,model_type,api_key=None) -> None:
     return tokenizer,base_summarizer

 def summarizer_summarize(model_type,tokenizer, base_summarizer, text:str,summarizer_type = "map_reduce")->str:
-    prompt = "SUmmarize this by focusing numerical importance sentences dont omit numerical sentences.Include all numerical details input text:"
-    text = prompt+text
-    text_to_summarize,length_type = prepare_for_summarize(text,tokenizer)
+    # prompt = "SUmmarize this by focusing numerical importance sentences dont omit numerical sentences.Include all numerical details input text:"
+    text = text
+
+    #!!!!!!!!!!!!!!!!!!!Removed because map reduce is not suitable or take long time
+    # text_to_summarize,length_type = prepare_for_summarize(text,tokenizer)
+    length_type = "short"
+    text_to_summarize = text

     if length_type =="short":
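With length_type hard-coded to "short" and prepare_for_summarize bypassed, nothing chunks long inputs any more; the only remaining safeguard is truncation=True at tokenization time in the next hunk, which silently drops overflowing text. A hedged sketch of a pre-flight length check one could add back (the token budget below is an assumption, not something this commit sets):

MAX_INPUT_TOKENS = 4096  # assumed context budget; set per model

def fits_context(tokenizer, text: str) -> bool:
    # Cheap check: count tokens before tokenizing for real, so callers can
    # warn or fall back to chunking instead of losing the tail to truncation.
    return len(tokenizer.encode(text)) <= MAX_INPUT_TOKENS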
 
@@ -45,7 +49,31 @@ def summarizer_summarize(model_type,tokenizer, base_summarizer, text:str,summarizer_type = "map_reduce")->str:
     elif model_type == "local":
         pipe = base_summarizer
         start = time.time()
-        summary = pipe(text_to_summarize)[0]['generated_text']
+
+        #!!!!!!!!!!!!!!!!!!!!Changes to llama model
+        input_text = text_to_summarize
+        chat = [
+            { "role": "user",
+              "content": f"""
+              SUmmarize this by focusing numerical importance sentences in the perspective of financial executive. input text: {input_text}
+              """ },
+        ]
+        prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+        inputs = tokenizer(prompt,
+                           return_tensors="pt", truncation=True).to('cuda')
+        attention_mask = inputs["attention_mask"]
+        approximate_tokens = int(len(text)//10)
+        output = base_summarizer.generate(inputs['input_ids'],
+                                          attention_mask = attention_mask,
+                                          top_k=10, max_new_tokens=approximate_tokens,
+                                          pad_token_id = tokenizer.eos_token_id)
+
+        base_summary = tokenizer.batch_decode(output[:, inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
+        summary = base_summary[0]
+        # summary = pipe(text_to_summarize)[0]['generated_text']
+        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1Changes finished for llama model
+
+
         end = time.time()
         print(f"Summary generation took {round((end-start),2)}s.")
         return summary,round((end-start),2)
 
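The new local path is the standard chat-template flow: wrap the instruction in a user message, render it with tokenizer.apply_chat_template, tokenize, generate, and decode only the tokens that follow the prompt. A self-contained sketch of that flow, keeping the commit's sizing heuristic of roughly one new token per ten input characters; the model id is illustrative, the device is chosen dynamically rather than hard-coding .to('cuda'), and the prompt typo ("SUmmarize") is fixed here:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"  # illustrative checkpoint
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

def summarize(text: str) -> str:
    chat = [{"role": "user",
             "content": "Summarize this by focusing on numerically important "
                        "sentences, from the perspective of a financial "
                        f"executive. Input text: {text}"}]
    # Render the chat into the model's own prompt format, then tokenize.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    # The commit's heuristic: about one new token per ten input characters.
    approximate_tokens = max(1, len(text) // 10)
    output = model.generate(inputs["input_ids"],
                            attention_mask=inputs["attention_mask"],
                            top_k=10,  # inert under default greedy decoding (no do_sample=True), as in the commit
                            max_new_tokens=approximate_tokens,
                            pad_token_id=tokenizer.eos_token_id)
    # Slice off the echoed prompt and decode only the newly generated tokens.
    return tokenizer.batch_decode(output[:, inputs["input_ids"].shape[-1]:],
                                  skip_special_tokens=True)[0]

Because generation runs greedily here, top_k has no effect; it is kept only to mirror the commit, and enabling sampling would require do_sample=True.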