parser / udpipe2 /udpipe2_client.py
anasampa2's picture
Upload 110 files
0e5da39 verified
raw
history blame
5.29 kB
#!/usr/bin/env python3
# This file is part of UDPipe 2 <http://github.com/ufal/udpipe>.
#
# Copyright 2022 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import email.mime.multipart
import email.mime.nonmultipart
import email.policy
import json
import os
import sys
import urllib.error
import urllib.request
__version__ = "2.1.1-dev"
def perform_request(server, method, params=None):
    """Call a method of a UDPipe REST service and return the decoded JSON reply.

    Args:
        server: Base URL of the service; ``method`` is appended after a "/".
        method: Name of the REST method to call (e.g. "models" or "process").
        params: Optional mapping of form-field names to string values. When it
            is None or empty, the request is sent without a body; otherwise
            the fields are sent as a multipart/form-data body.

    Returns:
        The parsed JSON response.

    Raises:
        urllib.error.HTTPError: The service answered with an HTTP error; the
            error body is printed to stderr before the exception is re-raised.
        json.JSONDecodeError: The response is not valid JSON; a diagnostic is
            printed to stderr before the exception is re-raised.
    """
    if not params:
        request_headers, request_data = {}, None
    else:
        message = email.mime.multipart.MIMEMultipart("form-data", policy=email.policy.HTTP)
        for name, value in params.items():
            payload = email.mime.nonmultipart.MIMENonMultipart("text", "plain")
            payload.add_header("Content-Disposition", "form-data; name=\"{}\"".format(name))
            payload.add_header("Content-Transfer-Encoding", "8bit")
            payload.set_payload(value, charset="utf-8")
            message.attach(payload)

        # Only the part after the first blank line (the body) is sent as data;
        # the MIME headers are passed to urllib separately.
        request_data = message.as_bytes().split(b"\r\n\r\n", maxsplit=1)[1]
        # Read Content-Type only after serialization, so that it carries the
        # multipart boundary used in the body above.
        request_headers = {"Content-Type": message["Content-Type"]}

    request = urllib.request.Request(
        url="{}/{}".format(server, method), headers=request_headers, data=request_data
    )
    try:
        with urllib.request.urlopen(request) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as e:
        # Decode leniently (and tolerate a missing body) so that reporting the
        # failure can never replace the HTTPError with a different exception.
        error_body = e.fp.read() if e.fp is not None else b""
        print("An exception was raised during UDPipe '{}' REST request.\n"
              "The service returned the following error:\n"
              " {}".format(method, error_body.decode("utf-8", errors="replace")), file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print("Cannot parse the JSON response of UDPipe '{}' REST request.\n"
              " {}".format(method, e.msg), file=sys.stderr)
        raise
def list_models(args):
    """Print the models offered by the service at ``args.service``.

    Every entry of the reply's "models" item is printed on its own line,
    followed by a "Default model:" line when the reply contains
    "default_model". Either part is skipped if its key is absent.
    """
    reply = perform_request(args.service, "models")

    available = reply["models"] if "models" in reply else ()
    for name in available:
        print(name)

    if "default_model" not in reply:
        return
    print("Default model:", reply["default_model"])
def process(args, data):
    """Send ``data`` to the service's "process" method and return its result.

    The request always carries the ``input`` and ``output`` values from
    ``args`` together with the data itself; ``model``, ``tokenizer``,
    ``parser`` and ``tagger`` are forwarded only when they are not None.

    The model reported by the service and a licence reminder are printed
    to stderr.

    Raises:
        ValueError: The response lacks the "model" or the "result" key.
    """
    optional_fields = ("model", "tokenizer", "parser", "tagger")

    form = {"input": args.input, "output": args.output, "data": data}
    form.update(
        (field, getattr(args, field))
        for field in optional_fields
        if getattr(args, field) is not None
    )

    reply = perform_request(args.service, "process", form)
    if not ("model" in reply and "result" in reply):
        raise ValueError("Cannot parse the UDPipe 'process' REST request response.")

    notices = (
        "UDPipe generated an output using the model '{}'.".format(reply["model"]),
        "Please respect the model licence (CC BY-NC-SA unless stated otherwise).",
    )
    for notice in notices:
        print(notice, file=sys.stderr)

    return reply["result"]
if __name__ == "__main__":
    # Command-line entry point: either list the service's models, or send each
    # input (files, or stdin when none are given) through the service and write
    # the results to stdout or to the file(s) described by --outfile.

    # Parse the client arguments.
    parser = argparse.ArgumentParser(description=(
        "Most of the options are passed directly to the service. For documentation, "
        "see https://lindat.mff.cuni.cz/services/udpipe/api-reference.php ."))
    parser.add_argument("inputs", nargs="*", type=str, help="Optional input files; stdin if not specified.")
    parser.add_argument("--list_models", default=False, action="store_true", help="List available models")
    parser.add_argument("--input", default="conllu", type=str, help="Input format")
    parser.add_argument("--model", default=None, type=str, help="Model to use")
    parser.add_argument("--output", default="conllu", type=str, help="Output format")
    parser.add_argument("--parser", default=None, type=str, help="Parser options")
    parser.add_argument("--tagger", default=None, type=str, help="Tagger options")
    parser.add_argument("--tokenizer", default=None, type=str, help="Tokenizer options")
    parser.add_argument("--outfile", default=None, type=str, help="Output path template; use {} as basename")
    parser.add_argument("--service", default="https://lindat.mff.cuni.cz/services/udpipe/api", type=str, help="Service URL")
    args = parser.parse_args()

    if args.list_models:
        list_models(args)
    else:
        outfile = None  # No output file opened.
        # With a "{}" placeholder every input gets its own output file;
        # without it, a single output file is shared by all inputs.
        per_input_outfile = bool(args.outfile) and "{}" in args.outfile
        try:
            for input_path in (args.inputs or [sys.stdin]):
                # Use stdin if no inputs are specified
                if input_path != sys.stdin:
                    # "utf-8-sig" drops a leading byte-order mark, if present.
                    with open(input_path, "r", encoding="utf-8-sig") as input_file:
                        data = input_file.read()
                else:
                    data = sys.stdin.read()

                if args.outfile and outfile is None:
                    # stdin has no basename, so the placeholder stays as it is.
                    outfile_path = args.outfile.replace("{}", (
                        os.path.splitext(os.path.basename(input_path))[0] if input_path != sys.stdin else "{}"))
                    outfile = open(outfile_path, "w", encoding="utf-8")

                (outfile or sys.stdout).write(process(args, data))

                if per_input_outfile:
                    outfile.close()
                    outfile = None
        finally:
            # Close the output file even when reading an input or the request
            # to the service fails part-way through.
            if outfile is not None:
                outfile.close()