File size: 7,636 Bytes
8c0b646
 
d5cc744
ca69fee
d5cc744
ca69fee
8c0b646
ba4ce98
19d0d27
ba4ce98
 
 
 
 
 
 
 
 
 
 
e20eecd
ba4ce98
 
 
fce4c33
 
 
c16370c
 
 
 
fce4c33
eabdff9
fce4c33
eabdff9
fce4c33
eabdff9
 
fce4c33
 
d5cc744
ca69fee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cc744
 
 
 
 
 
 
 
 
2ce1788
d5cc744
ca69fee
 
 
 
d5cc744
 
ca69fee
 
d5cc744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca69fee
fce4c33
c16370c
cd3e092
 
 
 
 
 
 
 
 
 
 
c16370c
 
cd3e092
eabdff9
 
 
cd3e092
c16370c
 
cd3e092
 
 
 
 
 
 
 
1ca245c
c16370c
cd3e092
ba4ce98
 
1ca245c
ba4ce98
3566540
ba4ce98
e20eecd
c16370c
ba4ce98
ca69fee
8c0b646
e3a2d6f
c16370c
ba4ce98
c16370c
19d0d27
8c0b646
 
3566540
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import gradio as gr
import numpy as np
import torch
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
from transformers import AutoModelForMaskedLM

# Sample patent-style passages offered as ready-made inputs for the demo.
ex_str1 = (
    "A crustless sandwich made from two slices of baked bread. The sandwich includes first and second matching "
    "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery "
    "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially "
    "crimped together at the outer peripheral area."
)

ex_str2 = (
    "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with"
    " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide"
    " associated with the target DNA. "
)

ex_str3 = (
    "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a "
    "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at "
    "least one of the length, width, and thickness values being 100 nanometers or smaller. "
)

# One row per example: [passage, temperature, number of edit rounds].
examples = [
    [ex_str1, 1.2, 1],
    [ex_str2, 1.5, 10],
    [ex_str3, 1.4, 20],
]


def add_mask(text, size=1):
    """Replace randomly chosen words of ``text`` with the ``[MASK]`` token.

    Words are the whitespace-separated tokens of ``text``. If the text already
    contains a ``[MASK]`` (also when punctuation is attached, e.g. ``[MASK].``),
    no further mask is added and the text is handed back unchanged.

    Args:
        text: The passage to mask.
        size: How many distinct words to mask. Capped at the number of words.

    Returns:
        A ``(masked_text, masked_strings)`` tuple, always.
        ``masked_strings`` holds the original word for every mask this function
        inserted, in the order they were drawn. For masks supplied by the
        caller the original word is unknown, so the list holds one ``None``
        per such mask. For empty or whitespace-only text it is empty.
    """
    split_text = text.split()

    # If the user supplies a mask, don't add more. Return the same tuple shape
    # as the normal path so callers can always unpack two values.
    user_masks = sum('[MASK]' in token for token in split_text)
    if user_masks:
        return text, [None] * user_masks

    # Nothing to mask: avoid asking the RNG for an index into an empty list.
    if not split_text:
        return text, []

    # Draw without replacement so the same word is never masked twice (which
    # would record '[MASK]' itself as an "original" word).
    n_masks = min(size, len(split_text))
    idx = np.random.choice(len(split_text), size=n_masks, replace=False)
    masked_strings = []
    for i in idx:
        masked_strings.append(split_text[i])
        split_text[i] = '[MASK]'
    masked_output = ' '.join(split_text)
    return masked_output, masked_strings


class TempScalePipe(FillMaskPipeline):
    """Fill-mask pipeline that applies temperature scaling to the mask logits.

    The logits at every masked position are divided by ``temp`` before the
    softmax in :meth:`postprocess`. A ``temp`` above 1 flattens the resulting
    distribution, a ``temp`` below 1 sharpens it, and ``temp == 1`` leaves the
    scores unscaled.
    """

    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
        """Collect the call-time options that :meth:`postprocess` understands.

        Args:
            top_k: Number of predictions to return per mask, if overridden.
            targets: Optional target words; converted to token ids through
                ``self.get_target_ids``.
            temp: Optional temperature for the logits.

        Returns:
            A 3-tuple of dicts. The first two are always empty; the third holds
            the keyword arguments for ``postprocess`` (following the
            transformers convention of preprocess / forward / postprocess
            parameter dicts).
        """
        postprocess_params = {}

        if targets is not None:
            target_ids = self.get_target_ids(targets, top_k)
            postprocess_params["target_ids"] = target_ids

        if top_k is not None:
            postprocess_params["top_k"] = top_k

        if temp is not None:
            postprocess_params["temp"] = temp
        return {}, {}, postprocess_params

    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            targets (`str` or `List[str]`, *optional*):
                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                resulting token will be used (with a warning, and that might be slower).
            top_k (`int`, *optional*):
                When passed, overrides the number of predictions to return.
            temp (`float`, *optional*):
                Temperature the mask logits are divided by before the softmax. Must be positive.

        Extra positional arguments are accepted for signature compatibility but are not forwarded.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
            - **score** (`float`) -- The corresponding probability.
            - **token** (`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (`str`) -- The predicted token (to replace the masked one).
        """
        outputs = super().__call__(inputs, **kwargs)
        # A one-element list of inputs is unwrapped so the caller gets the
        # result for that single text directly.
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs

    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
        """Turn raw model outputs into ranked, temperature-scaled predictions.

        Args:
            model_outputs: Mapping with ``"input_ids"`` and ``"logits"``; only
                the first sequence of the batch is read.
            top_k: Number of candidates to keep per masked position.
            target_ids: Optional tensor of token ids the candidates are
                restricted to; ``top_k`` is capped at its length.
            temp: Positive temperature the mask logits are divided by.

        Returns:
            For a single mask, a list of candidate dicts with keys ``score``,
            ``token``, ``token_str`` and ``sequence``. For several masks, a
            list with one such list per masked position.

        Raises:
            ValueError: If ``temp`` is not strictly positive (dividing by it
                would otherwise produce inf/NaN scores).
        """
        if temp <= 0:
            raise ValueError(f"temp must be a positive number, got {temp!r}")

        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]

        # Positions of every mask token in the sequence (one row per mask below).
        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)

        # Temperature scaling happens on the logits, before normalisation.
        logits = outputs[0, masked_index, :] / temp
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]
        values, predictions = probs.topk(top_k)

        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # With targets, p indexes into target_ids rather than the vocab.
                    p = target_ids[p].tolist()

                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result


# Register the custom pipeline class under the task name "temp-scale" so that
# `pipeline(...)` can resolve it. This must run before the pipeline is built below.
PIPELINE_REGISTRY.register_pipeline(
    "temp-scale",
    pipeline_class=TempScalePipe,
    pt_model=AutoModelForMaskedLM,
)
# Module-level side effect: the pipeline is constructed once, at import time.
# NOTE(review): presumably this loads (and on first use downloads) the
# "anferico/bert-for-patents" checkpoint — confirm for deployment.
scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")


def sample_output(out, sampling):
    """Pick one candidate token from a ``{token: score}`` mapping.

    Args:
        out: Mapping from candidate token string to its (non-negative) score.
        sampling: ``'multi'`` draws a token with probability proportional to
            its score; any other value draws uniformly among the candidates.

    Returns:
        The chosen token string.

    Raises:
        ValueError: If ``out`` is empty.
    """
    if not out:
        raise ValueError("sample_output needs at least one candidate token")

    # Index by position instead of inverting the mapping: two tokens with the
    # same score must both stay selectable.
    tokens = list(out)

    if sampling == 'multi':
        weights = np.asarray([out[token] for token in tokens], dtype=float)
        total = weights.sum()
        if total > 0:
            # Top-k scores generally sum to less than 1. Normalise them so the
            # missing mass is not silently assigned to the last candidate.
            idx = np.random.choice(len(tokens), p=weights / total)
            return tokens[idx]
        # No usable weights (all zero / NaN): fall back to a uniform draw.

    idx = np.random.randint(0, len(tokens))
    return tokens[idx]


def unmask(text, temp, rounds):
    """Rewrite ``text`` by repeatedly masking one word and letting the model refill it.

    Each round masks one word (unless the text already carries a mask), asks
    ``scrambler`` for its top 15 temperature-scaled candidates, samples one of
    them and writes it back wrapped in asterisks (``*token*``). If the sampled
    token equals the word that was masked, up to six uniform re-draws are made
    to try to get a different one.

    Args:
        text: The passage to edit.
        temp: Temperature forwarded to the pipeline.
        rounds: Number of edit rounds; converted with ``int()``.

    Returns:
        The edited text with its first character upper-cased. Empty or
        whitespace-only input is returned unchanged.
    """
    # Nothing to mask or capitalise; also avoids indexing into an empty string.
    if not text or not text.strip():
        return text

    sampling = 'multi'
    max_redraws = 6
    for _ in range(int(rounds)):
        masked_result = add_mask(text, size=1)
        # add_mask may hand back either a (text, originals) pair or just the
        # text itself; accept both shapes.
        if isinstance(masked_result, str):
            masked_text, masked = masked_result, []
        else:
            masked_text, masked = masked_result[0], masked_result[1]
        # None when the original word is unknown (e.g. a user-supplied mask).
        original_word = masked[0] if masked else None

        split_text = masked_text.split()
        res = scrambler(masked_text, temp=temp, top_k=15)
        # With several masks the pipeline returns one candidate list per mask;
        # fill the first mask this round and leave the rest for later rounds.
        if res and isinstance(res[0], list):
            res = res[0]
        mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
        out = {item["token_str"]: item["score"] for item in res}
        new_token = sample_output(out, sampling)

        # Try not to put the original word straight back in.
        redraws = 0
        while new_token == original_word and redraws < max_redraws:
            new_token = sample_output(out, sampling='uniform')
            redraws += 1

        split_text[mask_pos] = '*' + new_token + '*'
        text = ' '.join(split_text)

    return text[:1].upper() + text[1:]


# --- Gradio UI -------------------------------------------------------------
# Input passage and output area.
textbox = gr.Textbox(label="Example prompts", lines=5)
output_textbox = gr.Textbox(placeholder="Output will appear here", lines=4)
# "Creativity" is the temperature handed to `unmask` (range 1.0 to 2.0).
temp_slider = gr.Slider(1.0, 2.0, value=1.0, label='Creativity')
# Number of mask-and-refill rounds handed to `unmask`.
# NOTE(review): with minimum 1 and step 5 the reachable values are presumably
# 1, 6, 11, ... rather than multiples of 5 — confirm this is intended.
edit_slider = gr.Slider(1, 150, step=5, value=1.0, label='Number of edits')

# The inputs list is positional: it lines up with unmask(text, temp, rounds),
# and each row of `examples` follows the same order.
demo = gr.Interface(
    fn=unmask,
    inputs=[textbox, temp_slider, edit_slider],
    outputs=[output_textbox],
    examples=examples,
    allow_flagging='never'
)

# Module-level side effect: starts the app as soon as this file is run/imported.
demo.launch()