oleksandrfluxon committed on
Commit
acbf23c
·
1 Parent(s): a8c0df9

Create pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +78 -0
pipeline.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import transformers
3
+ from accelerate import dispatch_model, infer_auto_device_map
4
+ from accelerate.utils import get_balanced_memory
5
+ from typing import Dict, List, Any
6
+
7
class PreTrainedPipeline():
    """Text-generation pipeline wrapper around a causal-LM checkpoint.

    Construction loads the model config, the fp16 model weights and a
    tokenizer, spreads the model over the available devices with
    ``accelerate`` and stores a ``transformers`` text-generation pipeline in
    ``self.pipeline``. Calling the instance runs that pipeline on a request
    payload.
    """

    def __init__(self, path=""):
        """Load the model and tokenizer and build ``self.pipeline``.

        Args:
            path: Accepted for interface compatibility. It is currently
                ignored: the hard-coded checkpoint id below is always used.
        """
        # NOTE(review): the caller-supplied ``path`` is overwritten with a
        # fixed checkpoint id -- presumably intentional for this test
        # deployment; confirm before relying on the argument.
        path = "oleksandrfluxon/mpt-30b-chat-test"
        print("===> path", path)

        # NOTE(review): the original indentation was lost in the source this
        # was recovered from, so the exact extent of the autocast context is
        # reconstructed -- confirm against the real file.
        with torch.autocast('cuda'):
            config = transformers.AutoConfig.from_pretrained(
                path,
                trust_remote_code=True,
            )
            # Alternative attention backend that was tried and left disabled:
            # config.attn_config['attn_impl'] = 'triton'
            config.init_device = 'cuda:0'  # Initialize weights directly on the first GPU.
            config.max_seq_len = 4096  # (input + output) tokens can be up to 4096.

            print("===> loading model")
            # bfloat16 weights, device_map="auto" and 8-bit loading
            # (load_in_8bit=True) were tried and are disabled; weights are
            # loaded as float16 and placed on devices explicitly below.
            model = transformers.AutoModelForCausalLM.from_pretrained(
                path,
                config=config,
                torch_dtype=torch.float16,
                trust_remote_code=True,
            )
            print("===> model loaded")

            tokenizer = transformers.AutoTokenizer.from_pretrained(
                'EleutherAI/gpt-neox-20b',
                padding_side="left",
            )

            # Compute a per-device memory budget, never splitting a single
            # "MPTBlock" module across devices.
            max_memory = get_balanced_memory(
                model,
                max_memory=None,
                no_split_module_classes=["MPTBlock"],
                dtype='float16',
                low_zero=False,
            )

            # Turn the budget into a module -> device assignment and move the
            # model accordingly.
            device_map = infer_auto_device_map(
                model,
                max_memory=max_memory,
                no_split_module_classes=["MPTBlock"],
                dtype='float16',
            )
            model = dispatch_model(model, device_map=device_map)

            # No explicit ``device`` is passed: the model has already been
            # dispatched above.
            self.pipeline = transformers.pipeline(
                'text-generation',
                model=model,
                tokenizer=tokenizer,
            )
            print("===> init finished")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run the text-generation pipeline on a request payload.

        Args:
            data: Request payload. Recognised keys:
                ``inputs``: the value handed to the pipeline. When the key is
                    missing, ``data`` itself (minus the keys popped below) is
                    passed instead.
                ``parameters``: optional mapping of keyword arguments
                    forwarded to the pipeline call; a missing key or ``None``
                    means "no extra arguments".
                ``date``: removed from the payload and otherwise ignored.
                Note that ``data`` is modified in place (keys are popped).

        Returns:
            Whatever ``self.pipeline`` returns for the given inputs.
        """
        inputs = data.pop("inputs", data)

        # A payload carrying ``"parameters": null`` would otherwise fail on
        # ``**None``; treat it the same as an absent key.
        parameters = data.pop("parameters", None)
        if parameters is None:
            parameters = {}

        # Strip "date" so it is not forwarded when ``inputs`` falls back to
        # ``data``; the value itself is not used.
        data.pop("date", None)

        print("===> inputs", inputs)
        print("===> parameters", parameters)

        with torch.autocast('cuda'):
            result = self.pipeline(inputs, **parameters)
        print("===> result", result)

        return result