File size: 5,285 Bytes
0e5da39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3

# This file is part of UDPipe 2 <http://github.com/ufal/udpipe>.
#
# Copyright 2022 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse
import email.mime.multipart
import email.mime.nonmultipart
import email.policy
import json
import os
import sys
import urllib.error
import urllib.request

__version__ = "2.1.1-dev"


def perform_request(server, method, params=None):
    """Call one method of a UDPipe REST service and return the parsed JSON reply.

    Args:
        server: Base URL of the service; the request goes to ``server/method``.
        method: Name of the REST method, appended to ``server``.
        params: Optional mapping of form-field names to string values. When it
            is empty or ``None`` the request is sent without a body; otherwise
            the fields are sent as a ``multipart/form-data`` body.

    Returns:
        The decoded JSON document returned by the service.

    Raises:
        urllib.error.HTTPError: The service answered with an HTTP error; the
            error body is printed to stderr before re-raising.
        json.JSONDecodeError: The response body is not valid JSON; the parser
            message is printed to stderr before re-raising.
    """
    if not params:
        request_headers, request_data = {}, None
    else:
        message = email.mime.multipart.MIMEMultipart("form-data", policy=email.policy.HTTP)

        for name, value in params.items():
            payload = email.mime.nonmultipart.MIMENonMultipart("text", "plain")
            payload.add_header("Content-Disposition", "form-data; name=\"{}\"".format(name))
            payload.add_header("Content-Transfer-Encoding", "8bit")
            payload.set_payload(value, charset="utf-8")
            message.attach(payload)

        # Serialize the whole message, then drop its own header section: only
        # the part after the first empty line is the HTTP request body.
        request_data = message.as_bytes().split(b"\r\n\r\n", maxsplit=1)[1]
        # Read after serialization so that the generated boundary is included.
        request_headers = {"Content-Type": message["Content-Type"]}

    url = "{}/{}".format(server, method)
    try:
        with urllib.request.urlopen(urllib.request.Request(
            url=url, headers=request_headers, data=request_data
        )) as response:
            return json.loads(response.read())
    except urllib.error.HTTPError as e:
        # Decode leniently so a malformed error body cannot mask the HTTP error.
        print("An exception was raised during UDPipe '{}' REST request.\n"
              "The service returned the following error:\n"
              "  {}".format(method, e.fp.read().decode("utf-8", errors="replace")), file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print("Cannot parse the JSON response of UDPipe '{}' REST request.\n"
              "  {}".format(method, e.msg), file=sys.stderr)
        raise


def list_models(args):
    """Print the models offered by the service, followed by its default model.

    Queries the ``models`` method of ``args.service``. Every entry of the
    reply's ``models`` value is printed on its own line; if the reply carries
    a ``default_model`` key, it is printed last.
    """
    reply = perform_request(args.service, "models")

    # A reply without the "models" key simply lists nothing.
    available = reply["models"] if "models" in reply else ()
    for name in available:
        print(name)

    if "default_model" in reply:
        print("Default model:", reply["default_model"])


def process(args, data):
    """Send ``data`` to the service's ``process`` method and return its result.

    Args:
        args: Parsed client arguments; ``input``, ``output`` and ``service``
            are always used, while ``model``, ``tokenizer``, ``parser`` and
            ``tagger`` are forwarded only when they are not ``None``.
        data: The text to be processed.

    Returns:
        The ``result`` value of the service reply.

    Raises:
        ValueError: The reply lacks the ``model`` or the ``result`` key.
    """
    request_params = {"input": args.input, "output": args.output, "data": data}

    # Optional settings are passed on only when given on the command line.
    for option in ("model", "tokenizer", "parser", "tagger"):
        setting = getattr(args, option)
        if setting is None:
            continue
        request_params[option] = setting

    reply = perform_request(args.service, "process", request_params)
    if not ("model" in reply and "result" in reply):
        raise ValueError("Cannot parse the UDPipe 'process' REST request response.")

    used_model = reply["model"]
    print("UDPipe generated an output using the model '{}'.".format(used_model), file=sys.stderr)
    print("Please respect the model licence (CC BY-NC-SA unless stated otherwise).", file=sys.stderr)

    return reply["result"]


if __name__ == "__main__":
    # Parse the client arguments.
    parser = argparse.ArgumentParser(description=(
        "Most of the options are passed directly to the service. For documentation, "
        "see https://lindat.mff.cuni.cz/services/udpipe/api-reference.php ."))
    parser.add_argument("inputs", nargs="*", type=str, help="Optional input files; stdin if not specified.")
    parser.add_argument("--list_models", default=False, action="store_true", help="List available models")
    parser.add_argument("--input", default="conllu", type=str, help="Input format")
    parser.add_argument("--model", default=None, type=str, help="Model to use")
    parser.add_argument("--output", default="conllu", type=str, help="Output format")
    parser.add_argument("--parser", default=None, type=str, help="Parser options")
    parser.add_argument("--tagger", default=None, type=str, help="Tagger options")
    parser.add_argument("--tokenizer", default=None, type=str, help="Tokenizer options")
    parser.add_argument("--outfile", default=None, type=str, help="Output path template; use {} as basename")
    parser.add_argument("--service", default="https://lindat.mff.cuni.cz/services/udpipe/api", type=str, help="Service URL")
    args = parser.parse_args()

    if args.list_models:
        list_models(args)
    else:
        outfile = None  # No output file opened.

        # The try/finally guarantees that an opened output file is closed even
        # when reading an input or the service request raises.
        try:
            for input_path in (args.inputs or [sys.stdin]):
                # Use stdin if no inputs are specified
                if input_path != sys.stdin:
                    with open(input_path, "r", encoding="utf-8-sig") as input_file:
                        data = input_file.read()
                else:
                    data = sys.stdin.read()

                if args.outfile and outfile is None:
                    # Substitute the input basename (without extension) for {};
                    # for stdin there is no basename, so {} is kept verbatim.
                    output_path = args.outfile.replace("{}", (
                        os.path.splitext(os.path.basename(input_path))[0] if input_path != sys.stdin else "{}"))
                    outfile = open(output_path, "w", encoding="utf-8")

                (outfile or sys.stdout).write(process(args, data))

                # With a {} template every input gets its own output file;
                # without it, all results are appended to the single open file.
                if args.outfile and "{}" in args.outfile:
                    outfile.close()
                    outfile = None
        finally:
            if outfile is not None:
                outfile.close()