Spaces:
Runtime error
Runtime error
File size: 1,868 Bytes
02216c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import argparse
import csv
import json
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
# Hugging Face Hub identifier of the checkpoint used for both the tokenizer
# and the token-classification model.
MODEL = "d4data/biomedical-ner-all"

# Loaded once at import time and shared by every call that uses `pipe`.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)

# NER pipeline; process() reads `entity_group`, `word`, `score`, `start` and
# `end` from each item it returns.
pipe = pipeline(
    "ner",
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="simple",
)
def _entity_record(span):
    """Convert one entity dict returned by `pipe` into a plain output dict."""
    return {
        "entity": span["entity_group"],
        "word": span["word"],
        # float() normalizes the score to a builtin float before rounding
        # (presumably the pipeline yields a NumPy scalar -- confirm).
        "score": round(float(span["score"]), 2),
        "start": span["start"],
        "end": span["end"],
    }


def process(*args):
    """Run the module-level NER pipeline over a notes CSV and write JSON.

    Each CSV row must provide the columns ``text``, ``score``,
    ``student_id`` and ``case``. The output file holds a JSON list with one
    object per row: the row's ``score``, ``student_id`` and ``case`` plus an
    ``entities`` list built from the pipeline output for ``text``.

    Args:
        *args: Command-line style tokens, e.g.
            ``process("--notes", "notes.csv", "--out", "ner.json")``.
            When no tokens are given, ``sys.argv[1:]`` is parsed instead.

    Raises:
        ValueError: If ``--notes`` does not end with ``.csv`` or ``--out``
            does not end with ``.json``.
        KeyError: If a row is missing one of the required columns.
        SystemExit: Raised by argparse when a required option is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--notes', help='Notes CSV', required=True)
    parser.add_argument('--out', help='Output', required=True)
    # Parse the tokens the caller handed us; an empty tuple becomes None so
    # argparse falls back to sys.argv[1:], matching the old behaviour.
    options = parser.parse_args(list(args) or None)

    filepath = options.notes
    outpath = options.out

    # Validate both paths before any model inference is run.
    if not filepath.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")
    if not outpath.endswith(".json"):
        raise ValueError("Output path must be a .json file.")

    processed = []
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields that contain embedded newlines are read back intact.
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            raw = pipe(row["text"])
            # The note text itself is not copied into the output record.
            processed.append(
                {
                    "score": row["score"],
                    "student_id": row["student_id"],
                    "case": row["case"],
                    "entities": [_entity_record(x) for x in raw],
                }
            )

    # The output is only opened after every row has been processed, so a
    # failure part-way through does not leave a truncated JSON file behind.
    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(processed, f)
if __name__ == "__main__":
    # Script entry point: forward the command-line tokens (without the
    # program name) to process() as positional arguments.
    import sys

    cli_tokens = sys.argv[1:]
    process(*cli_tokens)
|