kwabs22 committed on
Commit
43647c4
·
1 Parent(s): 3dc2230

RAG Placeholder demo test

Browse files
Files changed (3) hide show
  1. README.md +2 -0
  2. app.py +144 -12
  3. requirements.txt +2 -1
README.md CHANGED
@@ -7,6 +7,8 @@ sdk: gradio
7
  sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
7
  sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: false
10
+ models:
11
+ - Qwen/Qwen2-0.5B-Instruct
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -10,12 +10,80 @@ import torch
10
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
11
  from threading import Thread
12
  import time
13
-
14
- # Initialize the zero tensor on CUDA
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  zero = torch.Tensor([0]).cuda()
16
  print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
17
 
18
- # Load the model and tokenizer
 
 
 
19
  llmguide_model = AutoModelForCausalLM.from_pretrained(
20
  "Qwen/Qwen2-0.5B-Instruct",
21
  torch_dtype="auto",
@@ -23,6 +91,29 @@ llmguide_model = AutoModelForCausalLM.from_pretrained(
23
  )
24
  llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  @spaces.GPU
27
  def llmguide_generate_response(prompt, stream=False):
28
  print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
@@ -58,7 +149,10 @@ def llmguide_generate_response(prompt, stream=False):
58
  total_tokens += 1
59
  current_time = time.time()
60
  tokens_per_second = total_tokens / (current_time - start_time)
61
- yield generated_text, f"{tokens_per_second:.2f}"
 
 
 
62
  else:
63
  generated_ids = llmguide_model.generate(
64
  model_inputs.input_ids,
@@ -71,7 +165,19 @@ def llmguide_generate_response(prompt, stream=False):
71
  total_tokens = len(generated_ids[0])
72
  end_time = time.time()
73
  tokens_per_second = total_tokens / (end_time - start_time)
74
- yield response, f"{tokens_per_second:.2f}"
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
  #--------------------------------------------------------------------------------------------------------------------------------
@@ -718,12 +824,25 @@ with gr.Blocks() as demo:
718
  with gr.Accordion("Qwen 0.5B as Space Guide Tests", open=False):
719
  gr.HTML("Placeholder for FAQ type - front end as prompt engineering for the first message to force direction of conversion")
720
  gr.HTML("Placeholder for weak RAG Type - Explanations through an opensource embeddings engine")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  ("Placeholder for https://huggingface.co/h2oai/h2o-danube3-500m-chat-GGUF as alternative")
722
  ("Placeholder for qwen 2 72b as alternative use checkbox and gradio client api call")
723
  gr.Markdown("# Qwen-0.5B-Instruct Language Model")
724
  gr.Markdown("This demo uses the Qwen-0.5B-Instruct model to generate responses based on your input.")
725
- gr.HTML("Example prompt: <br>I am writing a story about a chef. please write dishes to appear on the menu. <br>What are the most common decisions that a chef story would include?")
726
- gr.HTML("Continue this config - Paste any complete block of the config")
727
 
728
  with gr.Row():
729
  with gr.Column():
@@ -976,7 +1095,7 @@ Creating more diverse paths through the game""")
976
 
977
  with gr.Tab("Main areas of considerations"):
978
  with gr.Tab("Mermaid Graphs and Nesting"):
979
- gr.HTML("Claude Artifacts to illustrate <br> Nsted Structure - https://claude.site/artifacts/4a910d81-1541-49f4-8531-4f27fe56cd1e <br> https://claude.site/artifacts/265e9242-2093-46e1-9011-ed6ad938be90?fullscreen=false <br> ")
980
  gr.HTML("")
981
  with gr.Tab(""):
982
  gr.HTML("")
@@ -1713,7 +1832,7 @@ Would you like me to elaborate on any of these ideas or show how to implement th
1713
  ewpgenerate_button.click(generate_story_and_timeline, inputs=[ewpgenerate_no_ui_timeline_points, ewpgenerate_no_media_timeline_points, ewpgenerate_with_media_check], outputs=[ewptimeline_output_with_assets, ewptimeline_output, ewpstory_output, ewpwacustom_config, ewpgame_structure_output_text]) #ewpgame_structure_output_text_with_media, ewpgame_structure_output_text])
1714
 
1715
  with gr.Tab("Asset Generation Considerations"):
1716
- gr.HTML("With some asset category ideas from gemini-1.5-flash-api-0514 and reka-flash-preview-20240611 <br><br>Licenses for the spaces still to be evaluated - June 2024")
1717
 
1718
  with gr.Accordion("LLM HF Spaces/Sites (Click Here to Open) - Ask for a story and suggestions based on the autoconfig", open=False):
1719
  with gr.Row():
@@ -1732,17 +1851,30 @@ Would you like me to elaborate on any of these ideas or show how to implement th
1732
  with gr.Tab("Maths"):
1733
  gr.HTML("https://huggingface.co/spaces/AI-MO/math-olympiad-solver")
1734
 
 
 
 
 
 
 
 
 
 
 
 
 
1735
  with gr.Tab("Images"):
1736
  with gr.Accordion("Image Gen or Animation HF Spaces/Sites (Click Here to Open) - Have to download and upload at the the top", open=False):
1737
  # with gr.Tabs("General"):
1738
  with gr.Row():
1739
- linktoimagegen = gr.Dropdown(choices=["--General--", "https://pixart-alpha-pixart-sigma.hf.space", "https://stabilityai-stable-diffusion-3-medium.hf.space", "https://gokaygokay-kolors.hf.space", "https://prodia-sdxl-stable-diffusion-xl.hf.space", "https://prodia-fast-stable-diffusion.hf.space", "https://bytedance-hyper-sdxl-1step-t2i.hf.space", "https://multimodalart-cosxl.hf.space", "https://cagliostrolab-animagine-xl-3-1.hf.space", "https://stabilityai-stable-diffusion.hf.space",
 
1740
  "--Speed--", "https://radames-real-time-text-to-image-sdxl-lightning.hf.space", "https://ap123-sdxl-lightning.hf.space",
1741
  "--LORA Support--", "https://artificialguybr-artificialguybr-demo-lora.hf.space", "https://artificialguybr-studio-ghibli-lora-sdxl.hf.space", "https://artificialguybr-pixel-art-generator.hf.space", "https://fffiloni-sdxl-control-loras.hf.space", "https://ehristoforu-dalle-3-xl-lora-v2.hf.space",
1742
  "--Image to Image--", "https://lllyasviel-ic-light.hf.space", "https://gparmar-img2img-turbo-sketch.hf.space",
1743
  "--Control of Pose--", "https://instantx-instantid.hf.space", "https://modelscope-transferanything.hf.space", "https://okaris-omni-zero.hf.space"
1744
  "--Control of Shapes--", "https://linoyts-scribble-sdxl-flash.hf.space",
1745
- "--Foreign Language Input--", ""], label="Choose/Cancel type any .hf.space link here (can also type a link)'", allow_custom_value=True)
1746
  imagegenspacebtn = gr.Button("Use the chosen URL to load interface with a image generation model")
1747
 
1748
  imagegenspace = gr.HTML("Image Space Chosen will load here")
@@ -1784,7 +1916,7 @@ Would you like me to elaborate on any of these ideas or show how to implement th
1784
  gr.HTML("Placeholder for models small enough to run on cpu here in this space that can assist")
1785
 
1786
  with gr.Tab("Audio"):
1787
- with gr.Accordion("3D Model Spaces/Sites (Click Here to Open) - Image to Blender?", open=False):
1788
  with gr.Row():
1789
  linktoaudiiogenspace = gr.Dropdown(choices=["General", "https://artificialguybr-stable-audio-open-zero.hf.space", "",
1790
  "--Talking Portrait--","https://fffiloni-tts-hallo-talking-portrait.hf.space"],
 
10
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
11
  from threading import Thread
12
  import time
13
+ import psutil
14
+ from sentence_transformers import SentenceTransformer
15
+
16
+ # # Initialize the zero tensor on CUDA
17
+ # zero = torch.Tensor([0]).cuda()
18
+ # print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
19
+
20
+ # # Load the model and tokenizer
21
+ # llmguide_model = AutoModelForCausalLM.from_pretrained(
22
+ # "Qwen/Qwen2-0.5B-Instruct",
23
+ # torch_dtype="auto",
24
+ # device_map="auto"
25
+ # )
26
+ # llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
27
+
28
+ # @spaces.GPU
29
+ # def llmguide_generate_response(prompt, stream=False):
30
+ # print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
31
+
32
+ # messages = [
33
+ # {"role": "system", "content": "You are a helpful assistant."},
34
+ # {"role": "user", "content": prompt}
35
+ # ]
36
+ # text = llmguide_tokenizer.apply_chat_template(
37
+ # messages,
38
+ # tokenize=False,
39
+ # add_generation_prompt=True
40
+ # )
41
+ # model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
42
+
43
+ # start_time = time.time()
44
+ # total_tokens = 0
45
+
46
+ # if stream:
47
+ # streamer = TextIteratorStreamer(llmguide_tokenizer, skip_special_tokens=True)
48
+ # generation_kwargs = dict(
49
+ # model_inputs,
50
+ # streamer=streamer,
51
+ # max_new_tokens=512,
52
+ # temperature=0.7,
53
+ # )
54
+ # thread = Thread(target=llmguide_model.generate, kwargs=generation_kwargs)
55
+ # thread.start()
56
+
57
+ # generated_text = ""
58
+ # for new_text in streamer:
59
+ # generated_text += new_text
60
+ # total_tokens += 1
61
+ # current_time = time.time()
62
+ # tokens_per_second = total_tokens / (current_time - start_time)
63
+ # yield generated_text, f"{tokens_per_second:.2f}"
64
+ # else:
65
+ # generated_ids = llmguide_model.generate(
66
+ # model_inputs.input_ids,
67
+ # max_new_tokens=512
68
+ # )
69
+ # generated_ids = [
70
+ # output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
71
+ # ]
72
+ # response = llmguide_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
73
+ # total_tokens = len(generated_ids[0])
74
+ # end_time = time.time()
75
+ # tokens_per_second = total_tokens / (end_time - start_time)
76
+ # yield response, f"{tokens_per_second:.2f}"
77
+
78
+
79
+ # Initialize GPU tensor
80
  zero = torch.Tensor([0]).cuda()
81
  print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
82
 
83
+ # Load the embedding model
84
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
85
+
86
+ # Load the Qwen model and tokenizer
87
  llmguide_model = AutoModelForCausalLM.from_pretrained(
88
  "Qwen/Qwen2-0.5B-Instruct",
89
  torch_dtype="auto",
 
91
  )
92
  llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
93
 
94
+ # Sample knowledge base (replace with your own data)
95
+ knowledge_base = [
96
+ "The capital of France is Paris.",
97
+ "Python is a popular programming language.",
98
+ "Machine learning is a subset of artificial intelligence.",
99
+ "The Earth orbits around the Sun.",
100
+ ]
101
+
102
+ # Create embeddings for the knowledge base
103
+ knowledge_base_embeddings = embedding_model.encode(knowledge_base)
104
+
105
def retrieve(query, k=2):
    """Return up to ``k`` knowledge-base entries most similar to ``query``.

    The query is embedded with the module-level ``embedding_model`` and
    compared against ``knowledge_base_embeddings`` by cosine similarity.

    Args:
        query: Text to look up.
        k: Maximum number of entries to return. Non-positive values yield
            an empty list (a negative ``k`` previously sliced off the *last*
            entries instead of limiting the result).

    Returns:
        A list of strings from ``knowledge_base``, best match first.
    """
    if k <= 0:
        return []

    # Never ask topk for more rows than the corpus holds.
    top_k = min(k, len(knowledge_base))
    if top_k == 0:
        return []

    # encode([query]) yields a single-row batch; cosine_similarity broadcasts
    # that row against every knowledge-base row (default dim=1).
    query_embedding = torch.as_tensor(embedding_model.encode([query]))
    corpus_embeddings = torch.as_tensor(knowledge_base_embeddings)
    similarities = torch.nn.functional.cosine_similarity(query_embedding, corpus_embeddings)

    # Convert to plain ints so the list is not indexed with 0-dim tensors.
    top_k_indices = torch.topk(similarities, top_k).indices.tolist()
    return [knowledge_base[i] for i in top_k_indices]
110
+
111
def get_resource_usage():
    """Build a one-line summary of host RAM and CUDA memory usage.

    Returns:
        A string reporting RAM usage as a percentage and the CUDA memory
        currently allocated and reserved by PyTorch, both in GB.
    """
    bytes_per_gb = 1024 ** 3  # PyTorch reports byte counts; scale to GB
    ram_percent = psutil.virtual_memory().percent
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    return f"RAM Usage: {ram_percent:.2f}%, GPU Memory Allocated: {allocated_gb:.2f}GB, GPU Memory Reserved: {reserved_gb:.2f}GB"
116
+
117
  @spaces.GPU
118
  def llmguide_generate_response(prompt, stream=False):
119
  print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
 
149
  total_tokens += 1
150
  current_time = time.time()
151
  tokens_per_second = total_tokens / (current_time - start_time)
152
+ yield generated_text, f"{tokens_per_second:.2f}", ""
153
+
154
+ resource_usage = get_resource_usage()
155
+ yield generated_text, f"{tokens_per_second:.2f}", resource_usage
156
  else:
157
  generated_ids = llmguide_model.generate(
158
  model_inputs.input_ids,
 
165
  total_tokens = len(generated_ids[0])
166
  end_time = time.time()
167
  tokens_per_second = total_tokens / (end_time - start_time)
168
+ resource_usage = get_resource_usage()
169
+ yield response, f"{tokens_per_second:.2f}", resource_usage
170
+
171
+ # Clear CUDA cache
172
+ # torch.cuda.empty_cache()
173
+ # gc.collect()
174
+
175
def rag(query, stream=False):
    """Answer ``query`` with retrieved context prepended to the prompt.

    This is a generator function: it re-yields every tuple produced by
    ``llmguide_generate_response``. It must be a generator itself (rather
    than returning the inner generator) because Gradio inspects the handler
    function to decide whether to iterate its result; a plain function that
    returns a generator object would be treated as returning one value.

    Args:
        query: The user's question.
        stream: Passed through to ``llmguide_generate_response``.

    Yields:
        Whatever ``llmguide_generate_response`` yields for the built prompt.
    """
    retrieved_docs = retrieve(query)
    context = " ".join(retrieved_docs)
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    yield from llmguide_generate_response(prompt, stream)
180
+
181
 
182
 
183
  #--------------------------------------------------------------------------------------------------------------------------------
 
824
  with gr.Accordion("Qwen 0.5B as Space Guide Tests", open=False):
825
  gr.HTML("Placeholder for FAQ type - front end as prompt engineering for the first message to force direction of conversion")
826
  gr.HTML("Placeholder for weak RAG Type - Explanations through an opensource embeddings engine")
827
+ gr.Interface(
828
+ fn=rag,
829
+ inputs=[
830
+ gr.Textbox(lines=2, placeholder="Enter your question here..."),
831
+ gr.Checkbox(label="Stream output")
832
+ ],
833
+ outputs=[
834
+ gr.Textbox(label="Generated Response"),
835
+ gr.Textbox(label="Tokens per second"),
836
+ gr.Textbox(label="Resource Usage")
837
+ ],
838
+ title="RAG Q&A System with GPU Acceleration and Resource Monitoring",
839
+ description="Ask a question and get an answer based on the retrieved context. The response is generated using a GPU-accelerated model. Resource usage is logged at the end of generation."
840
+ )
841
  ("Placeholder for https://huggingface.co/h2oai/h2o-danube3-500m-chat-GGUF as alternative")
842
  ("Placeholder for qwen 2 72b as alternative use checkbox and gradio client api call")
843
  gr.Markdown("# Qwen-0.5B-Instruct Language Model")
844
  gr.Markdown("This demo uses the Qwen-0.5B-Instruct model to generate responses based on your input.")
845
+ gr.HTML("Example prompts: <br>I am writing a story about a chef. please write dishes to appear on the menu. <br>What are the most common decisions that a chef story would include? <br>What are the kinds problems that a chef story would include? <br>What are the kinds of out of reach goals that a chef story would include? <br>Continue this config - Paste any complete block of the config")
 
846
 
847
  with gr.Row():
848
  with gr.Column():
 
1095
 
1096
  with gr.Tab("Main areas of considerations"):
1097
  with gr.Tab("Mermaid Graphs and Nesting"):
1098
+ gr.HTML("Claude Artifacts to illustrate nested structure brainstorms - <br> https://claude.site/artifacts/4a910d81-1541-49f4-8531-4f27fe56cd1e <br> https://claude.site/artifacts/265e9242-2093-46e1-9011-ed6ad938be90?fullscreen=false <br> ")
1099
  gr.HTML("")
1100
  with gr.Tab(""):
1101
  gr.HTML("")
 
1832
  ewpgenerate_button.click(generate_story_and_timeline, inputs=[ewpgenerate_no_ui_timeline_points, ewpgenerate_no_media_timeline_points, ewpgenerate_with_media_check], outputs=[ewptimeline_output_with_assets, ewptimeline_output, ewpstory_output, ewpwacustom_config, ewpgame_structure_output_text]) #ewpgame_structure_output_text_with_media, ewpgame_structure_output_text])
1833
 
1834
  with gr.Tab("Asset Generation Considerations"):
1835
+ gr.HTML("With some asset category ideas from gemini-1.5-flash-api-0514 and reka-flash-preview-20240611 <br><br>Licenses for the spaces still to be evaluated - June 2024 <br> Users to follow with cool spaces - https://huggingface.co/fffiloni, https://huggingface.co/artificialguybr, https://huggingface.co/radames, https://huggingface.co/multimodalart, ")
1836
 
1837
  with gr.Accordion("LLM HF Spaces/Sites (Click Here to Open) - Ask for a story and suggestions based on the autoconfig", open=False):
1838
  with gr.Row():
 
1851
  with gr.Tab("Maths"):
1852
  gr.HTML("https://huggingface.co/spaces/AI-MO/math-olympiad-solver")
1853
 
1854
+ with gr.Tab("Media Understanding"):
1855
+ gr.HTML("NPC Response Engines? Camera, Shopkeeper, Companion, Enemies, etc.")
1856
+ with gr.Accordion("Media understanding model Spaces/Sites (Click Here to Open) - Image to Blender?", open=False):
1857
+ with gr.Row():
1858
+ linktomediaunderstandingspace = gr.Dropdown(choices=[ "--Weak Audio Understanding = Audio to text, Weak Video Understanding = Video to Image to Image Understanding", "https://skalskip-florence-2-video.hf.space", "https://kingnish-opengpt-4o.hf.space",
1859
+ "--Image Understanding--", "https://qnguyen3-nanollava.hf.space", "https://skalskip-better-florence-2.hf.space", ],
1860
+ label="Choose/Cancel type any .hf.space link here (can also type a link)'", allow_custom_value=True)
1861
+ mediaunderstandingspacebtn = gr.Button("Use the chosen URL to load interface with a media understanding space")
1862
+ mediaunderstandingspace = gr.HTML("Audio Space Chosen will load here")
1863
+ mediaunderstandingspacebtn.click(display_website, inputs=linktomediaunderstandingspace, outputs=mediaunderstandingspace)
1864
+
1865
+
1866
  with gr.Tab("Images"):
1867
  with gr.Accordion("Image Gen or Animation HF Spaces/Sites (Click Here to Open) - Have to download and upload at the the top", open=False):
1868
  # with gr.Tabs("General"):
1869
  with gr.Row():
1870
# Choices are grouped under "--Header--" entries; each group must end with a
# comma, otherwise adjacent string literals are silently concatenated (the
# last pose-control URL was previously fused with the "--Control of Shapes--"
# header).
linktoimagegen = gr.Dropdown(choices=["Text-Interleaved", "https://ethanchern-anole.hf.space",
    "--General--", "https://pixart-alpha-pixart-sigma.hf.space", "https://stabilityai-stable-diffusion-3-medium.hf.space", "https://prodia-sdxl-stable-diffusion-xl.hf.space", "https://prodia-fast-stable-diffusion.hf.space", "https://bytedance-hyper-sdxl-1step-t2i.hf.space", "https://multimodalart-cosxl.hf.space", "https://cagliostrolab-animagine-xl-3-1.hf.space", "https://stabilityai-stable-diffusion.hf.space",
    "--Speed--", "https://radames-real-time-text-to-image-sdxl-lightning.hf.space", "https://ap123-sdxl-lightning.hf.space",
    "--LORA Support--", "https://artificialguybr-artificialguybr-demo-lora.hf.space", "https://artificialguybr-studio-ghibli-lora-sdxl.hf.space", "https://artificialguybr-pixel-art-generator.hf.space", "https://fffiloni-sdxl-control-loras.hf.space", "https://ehristoforu-dalle-3-xl-lora-v2.hf.space",
    "--Image to Image--", "https://lllyasviel-ic-light.hf.space", "https://gparmar-img2img-turbo-sketch.hf.space",
    "--Control of Pose--", "https://instantx-instantid.hf.space", "https://modelscope-transferanything.hf.space", "https://okaris-omni-zero.hf.space",
    "--Control of Shapes--", "https://linoyts-scribble-sdxl-flash.hf.space",
    "--Foreign Language Input--", "https://gokaygokay-kolors.hf.space"], label="Choose/Cancel type any .hf.space link here (can also type a link)'", allow_custom_value=True)
1878
  imagegenspacebtn = gr.Button("Use the chosen URL to load interface with a image generation model")
1879
 
1880
  imagegenspace = gr.HTML("Image Space Chosen will load here")
 
1916
  gr.HTML("Placeholder for models small enough to run on cpu here in this space that can assist")
1917
 
1918
  with gr.Tab("Audio"):
1919
+ with gr.Accordion("Audio Spaces/Sites (Click Here to Open) - Image to Blender?", open=False):
1920
  with gr.Row():
1921
  linktoaudiiogenspace = gr.Dropdown(choices=["General", "https://artificialguybr-stable-audio-open-zero.hf.space", "",
1922
  "--Talking Portrait--","https://fffiloni-tts-hallo-talking-portrait.hf.space"],
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  transformers
2
- accelerate
 
 
1
  transformers
2
+ accelerate
3
+ sentence-transformers