flexibility for cpu or cuda ep
handler.py CHANGED (+13 -2)
@@ -11,15 +11,26 @@ if torch.backends.cudnn.is_available():
 
 class EndpointHandler():
     def __init__(self, path=""):
+
+        on_cuda = torch.cuda.is_available()
         # load the optimized model
+
+        provider = "CPUExecutionProvider"
+        if on_cuda:
+            provider = "CUDAExecutionProvider"
+
         model = ORTModelForSequenceClassification.from_pretrained(
             path,
             export=False,
-            provider=
+            provider=provider,
         )
         tokenizer = AutoTokenizer.from_pretrained(path)
+
+        device = -1
+        if on_cuda:
+            device = 0
         # create inference pipeline
-        self.pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)
 
 
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
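For context, the handler after this change looks roughly like the sketch below. Only the diff hunk above is authoritative: the imports and the `__call__` body are not shown in the diff, so they are assumptions here (a typical Inference Endpoints custom handler reads the text from `data["inputs"]`); the provider and device selection mirrors the added lines.

```python
from typing import Any, Dict, List

import torch
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, pipeline


class EndpointHandler():
    def __init__(self, path=""):
        # Detect a GPU once and derive both the ONNX Runtime execution
        # provider and the pipeline device index from it.
        on_cuda = torch.cuda.is_available()

        provider = "CPUExecutionProvider"
        if on_cuda:
            provider = "CUDAExecutionProvider"

        # load the optimized model (already exported to ONNX, hence export=False)
        model = ORTModelForSequenceClassification.from_pretrained(
            path,
            export=False,
            provider=provider,
        )
        tokenizer = AutoTokenizer.from_pretrained(path)

        # transformers pipelines take -1 for CPU and 0 for the first GPU
        device = -1
        if on_cuda:
            device = 0

        # create inference pipeline
        self.pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
        # Body not shown in the diff; a minimal pass-through for illustration,
        # assuming the endpoint payload carries the text under "inputs".
        inputs = data["inputs"] if isinstance(data, dict) else data
        return self.pipeline(inputs)
```

With the provider and device both derived from `torch.cuda.is_available()`, the same handler runs unchanged on CPU-only and GPU endpoints.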