seanpedrickcase committed on
Commit
85b6613
·
1 Parent(s): 5cdf399

CPU Flan inference is crashing, so trying to revert to previous package versions that worked

Browse files
Files changed (3) hide show
  1. app.py +4 -4
  2. chatfuncs/chatfuncs.py +11 -175
  3. requirements.txt +5 -6
app.py CHANGED
@@ -113,14 +113,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
113
 
114
  if torch_device == "cuda":
115
  if "flan" in model_name:
116
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
117
  else:
118
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
119
  else:
120
  if "flan" in model_name:
121
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
122
  else:
123
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
124
 
125
  tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
126
 
 
113
 
114
  if torch_device == "cuda":
115
  if "flan" in model_name:
116
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
117
  else:
118
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
119
  else:
120
  if "flan" in model_name:
121
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
122
  else:
123
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)#, torch_dtype=torch.float16)
124
 
125
  tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
126
 
chatfuncs/chatfuncs.py CHANGED
@@ -99,66 +99,17 @@ context_length:int = 2048
99
  sample = True
100
 
101
 
102
- # class CtransInitConfig_gpu:
103
- # def __init__(self, temperature=temperature,
104
- # top_k=top_k,
105
- # top_p=top_p,
106
- # repetition_penalty=repetition_penalty,
107
- # last_n_tokens=last_n_tokens,
108
- # max_new_tokens=max_new_tokens,
109
- # seed=seed,
110
- # reset=reset,
111
- # stream=stream,
112
- # threads=threads,
113
- # batch_size=batch_size,
114
- # context_length=context_length,
115
- # gpu_layers=gpu_layers):
116
- # self.temperature = temperature
117
- # self.top_k = top_k
118
- # self.top_p = top_p
119
- # self.repetition_penalty = repetition_penalty# repetition_penalty
120
- # self.last_n_tokens = last_n_tokens
121
- # self.max_new_tokens = max_new_tokens
122
- # self.seed = seed
123
- # self.reset = reset
124
- # self.stream = stream
125
- # self.threads = threads
126
- # self.batch_size = batch_size
127
- # self.context_length = context_length
128
- # self.gpu_layers = gpu_layers
129
- # # self.stop: list[str] = field(default_factory=lambda: [stop_string])
130
-
131
- # def update_gpu(self, new_value):
132
- # self.gpu_layers = new_value
133
-
134
- # class CtransInitConfig_cpu(CtransInitConfig_gpu):
135
- # def __init__(self):
136
- # super().__init__()
137
- # self.gpu_layers = 0
138
-
139
  class CtransInitConfig_gpu:
140
- def __init__(self, #temperature=temperature,
141
- #top_k=top_k,
142
- #top_p=top_p,
143
- #repetition_penalty=repetition_penalty,
144
  last_n_tokens=last_n_tokens,
145
- #max_new_tokens=max_new_tokens,
146
  seed=seed,
147
- #reset=reset,
148
- #stream=stream,
149
  n_threads=threads,
150
  n_batch=batch_size,
151
  n_ctx=4096,
152
  n_gpu_layers=gpu_layers):
153
- #self.temperature = temperature
154
- #self.top_k = top_k
155
- #self.top_p = top_p
156
- #self.repetition_penalty = repetition_penalty# repetition_penalty
157
  self.last_n_tokens = last_n_tokens
158
- #self.max_new_tokens = max_new_tokens
159
  self.seed = seed
160
- #self.reset = reset
161
- #self.stream = stream
162
  self.n_threads = n_threads
163
  self.n_batch = n_batch
164
  self.n_ctx = n_ctx
@@ -177,51 +128,22 @@ gpu_config = CtransInitConfig_gpu()
177
  cpu_config = CtransInitConfig_cpu()
178
 
179
 
180
- # class CtransGenGenerationConfig:
181
- # def __init__(self, temperature=temperature,
182
- # top_k=top_k,
183
- # top_p=top_p,
184
- # repetition_penalty=repetition_penalty,
185
- # last_n_tokens=last_n_tokens,
186
- # seed=seed,
187
- # threads=threads,
188
- # batch_size=batch_size,
189
- # reset=True
190
- # ):
191
- # self.temperature = temperature
192
- # self.top_k = top_k
193
- # self.top_p = top_p
194
- # self.repetition_penalty = repetition_penalty# repetition_penalty
195
- # self.last_n_tokens = last_n_tokens
196
- # self.seed = seed
197
- # self.threads = threads
198
- # self.batch_size = batch_size
199
- # self.reset = reset
200
-
201
  class CtransGenGenerationConfig:
202
  def __init__(self, temperature=temperature,
203
  top_k=top_k,
204
  top_p=top_p,
205
  repeat_penalty=repetition_penalty,
206
- #last_n_tokens=last_n_tokens,
207
  seed=seed,
208
  stream=stream,
209
  max_tokens=max_new_tokens
210
- #threads=threads,
211
- #batch_size=batch_size,
212
- #reset=True
213
  ):
214
  self.temperature = temperature
215
  self.top_k = top_k
216
  self.top_p = top_p
217
  self.repeat_penalty = repeat_penalty
218
- #self.last_n_tokens = last_n_tokens
219
  self.seed = seed
220
  self.max_tokens=max_tokens
221
  self.stream = stream
222
- #self.threads = threads
223
- #self.batch_size = batch_size
224
- #self.reset = reset
225
 
226
  def update_temp(self, new_value):
227
  self.temperature = new_value
@@ -417,93 +339,6 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
417
  return history, docs_content_string, instruction_prompt_out
418
 
419
  # Chat functions
420
- # def produce_streaming_answer_chatbot(history, full_prompt, model_type,
421
- # temperature=temperature,
422
- # max_new_tokens=max_new_tokens,
423
- # sample=sample,
424
- # repetition_penalty=repetition_penalty,
425
- # top_p=top_p,
426
- # top_k=top_k
427
- # ):
428
- # #print("Model type is: ", model_type)
429
-
430
- # #if not full_prompt.strip():
431
- # # if history is None:
432
- # # history = []
433
-
434
- # # return history
435
-
436
- # if model_type == "Flan Alpaca (small, fast)":
437
- # # Get the model and tokenizer, and tokenize the user text.
438
- # model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
439
-
440
- # # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
441
- # # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
442
- # streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
443
- # generate_kwargs = dict(
444
- # model_inputs,
445
- # streamer=streamer,
446
- # max_new_tokens=max_new_tokens,
447
- # do_sample=sample,
448
- # repetition_penalty=repetition_penalty,
449
- # top_p=top_p,
450
- # temperature=temperature,
451
- # top_k=top_k
452
- # )
453
-
454
- # print(generate_kwargs)
455
-
456
- # t = Thread(target=model.generate, kwargs=generate_kwargs)
457
- # t.start()
458
-
459
- # # Pull the generated text from the streamer, and update the model output.
460
- # start = time.time()
461
- # NUM_TOKENS=0
462
- # print('-'*4+'Start Generation'+'-'*4)
463
-
464
- # history[-1][1] = ""
465
- # for new_text in streamer:
466
- # if new_text == None: new_text = ""
467
- # history[-1][1] += new_text
468
- # NUM_TOKENS+=1
469
- # yield history
470
-
471
- # time_generate = time.time() - start
472
- # print('\n')
473
- # print('-'*4+'End Generation'+'-'*4)
474
- # print(f'Num of generated tokens: {NUM_TOKENS}')
475
- # print(f'Time for complete generation: {time_generate}s')
476
- # print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
477
- # print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
478
-
479
- # elif model_type == "Mistral Open Orca (larger, slow)":
480
- # tokens = model.tokenize(full_prompt)
481
-
482
- # gen_config = CtransGenGenerationConfig()
483
- # gen_config.update_temp(temperature)
484
-
485
- # print(vars(gen_config))
486
-
487
- # # Pull the generated text from the streamer, and update the model output.
488
- # start = time.time()
489
- # NUM_TOKENS=0
490
- # print('-'*4+'Start Generation'+'-'*4)
491
-
492
- # history[-1][1] = ""
493
- # for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
494
- # if new_text == None: new_text = ""
495
- # history[-1][1] += model.detokenize(new_text) #new_text
496
- # NUM_TOKENS+=1
497
- # yield history
498
-
499
- # time_generate = time.time() - start
500
- # print('\n')
501
- # print('-'*4+'End Generation'+'-'*4)
502
- # print(f'Num of generated tokens: {NUM_TOKENS}')
503
- # print(f'Time for complete generation: {time_generate}s')
504
- # print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
505
- # print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
506
-
507
 
508
  def produce_streaming_answer_chatbot(history, full_prompt, model_type,
509
  temperature=temperature,
@@ -523,8 +358,8 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
523
 
524
  if model_type == "Flan Alpaca (small, fast)":
525
  # Get the model and tokenizer, and tokenize the user text.
526
- model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
527
-
528
  # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
529
  # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
530
  streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
@@ -551,10 +386,13 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
551
 
552
  history[-1][1] = ""
553
  for new_text in streamer:
554
- if new_text == None: new_text = ""
555
- history[-1][1] += new_text
556
- NUM_TOKENS+=1
557
- yield history
 
 
 
558
 
559
  time_generate = time.time() - start
560
  print('\n')
@@ -567,8 +405,6 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
567
  elif model_type == "Mistral Open Orca (larger, slow)":
568
  #tokens = model.tokenize(full_prompt)
569
 
570
- temp = ""
571
-
572
  gen_config = CtransGenGenerationConfig()
573
  gen_config.update_temp(temperature)
574
 
 
99
  sample = True
100
 
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  class CtransInitConfig_gpu:
103
+ def __init__(self,
 
 
 
104
  last_n_tokens=last_n_tokens,
 
105
  seed=seed,
 
 
106
  n_threads=threads,
107
  n_batch=batch_size,
108
  n_ctx=4096,
109
  n_gpu_layers=gpu_layers):
110
+
 
 
 
111
  self.last_n_tokens = last_n_tokens
 
112
  self.seed = seed
 
 
113
  self.n_threads = n_threads
114
  self.n_batch = n_batch
115
  self.n_ctx = n_ctx
 
128
  cpu_config = CtransInitConfig_cpu()
129
 
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  class CtransGenGenerationConfig:
132
  def __init__(self, temperature=temperature,
133
  top_k=top_k,
134
  top_p=top_p,
135
  repeat_penalty=repetition_penalty,
 
136
  seed=seed,
137
  stream=stream,
138
  max_tokens=max_new_tokens
 
 
 
139
  ):
140
  self.temperature = temperature
141
  self.top_k = top_k
142
  self.top_p = top_p
143
  self.repeat_penalty = repeat_penalty
 
144
  self.seed = seed
145
  self.max_tokens=max_tokens
146
  self.stream = stream
 
 
 
147
 
148
  def update_temp(self, new_value):
149
  self.temperature = new_value
 
339
  return history, docs_content_string, instruction_prompt_out
340
 
341
  # Chat functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  def produce_streaming_answer_chatbot(history, full_prompt, model_type,
344
  temperature=temperature,
 
358
 
359
  if model_type == "Flan Alpaca (small, fast)":
360
  # Get the model and tokenizer, and tokenize the user text.
361
+ model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device)
362
+
363
  # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
364
  # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
365
  streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
 
386
 
387
  history[-1][1] = ""
388
  for new_text in streamer:
389
+ try:
390
+ if new_text == None: new_text = ""
391
+ history[-1][1] += new_text
392
+ NUM_TOKENS+=1
393
+ yield history
394
+ except Exception as e:
395
+ print(f"Error during text generation: {e}")
396
 
397
  time_generate = time.time() - start
398
  print('\n')
 
405
  elif model_type == "Mistral Open Orca (larger, slow)":
406
  #tokens = model.tokenize(full_prompt)
407
 
 
 
408
  gen_config = CtransGenGenerationConfig()
409
  gen_config.update_temp(temperature)
410
 
requirements.txt CHANGED
@@ -2,15 +2,14 @@ langchain
2
  langchain-community
3
  beautifulsoup4
4
  pandas
5
- transformers
6
  llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
7
- #torch \
8
- #--extra-index-url https://download.pytorch.org/whl/cu121
9
- sentence_transformers
10
- faiss-cpu
11
  pypdf
12
  python-docx
13
- #ctransformers[cuda]
14
  keybert
15
  span_marker
16
  gensim
 
2
  langchain-community
3
  beautifulsoup4
4
  pandas
5
+ transformers==4.34.0
6
  llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
7
+ torch \
8
+ --extra-index-url https://download.pytorch.org/whl/cu121
9
+ sentence_transformers==2.2.2
10
+ faiss-cpu==1.7.4
11
  pypdf
12
  python-docx
 
13
  keybert
14
  span_marker
15
  gensim