Spaces:
Build error
Build error
File size: 7,636 Bytes
8c0b646 d5cc744 ca69fee d5cc744 ca69fee 8c0b646 ba4ce98 19d0d27 ba4ce98 e20eecd ba4ce98 fce4c33 c16370c fce4c33 eabdff9 fce4c33 eabdff9 fce4c33 eabdff9 fce4c33 d5cc744 ca69fee d5cc744 2ce1788 d5cc744 ca69fee d5cc744 ca69fee d5cc744 ca69fee fce4c33 c16370c cd3e092 c16370c cd3e092 eabdff9 cd3e092 c16370c cd3e092 1ca245c c16370c cd3e092 ba4ce98 1ca245c ba4ce98 3566540 ba4ce98 e20eecd c16370c ba4ce98 ca69fee 8c0b646 e3a2d6f c16370c ba4ce98 c16370c 19d0d27 8c0b646 3566540 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
import gradio as gr
import numpy as np
import torch
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
from transformers import AutoModelForMaskedLM
# Sample patent-style passages offered as ready-made inputs for the demo.
ex_str1 = (
    "A crustless sandwich made from two slices of baked bread. The sandwich includes first and second matching "
    "crustless bread pieces. The bread pieces have the same general outer shape defined by an outer periphery "
    "with central portions surrounded by an outer peripheral area, the bread pieces being at least partially "
    "crimped together at the outer peripheral area."
)
ex_str2 = (
    "The present disclosure provides a DNA-targeting RNA that comprises a targeting sequence and, together with"
    " a modifying polypeptide, provides for site-specific modification of a target DNA and/or a polypeptide"
    " associated with the target DNA. "
)
ex_str3 = (
    "The graphite plane is composed of a two-dimensional hexagonal lattice of carbon atoms and the plate has a "
    "length and a width parallel to the graphite plane and a thickness orthogonal to the graphite plane with at "
    "least one of the length, width, and thickness values being 100 nanometers or smaller. "
)

# One row per example: [passage, temperature, number of edits].
examples = [
    [ex_str1, 1.2, 1],
    [ex_str2, 1.5, 10],
    [ex_str3, 1.4, 20],
]
def add_mask(text, size=1):
    """Replace randomly chosen whitespace-separated words of ``text`` with ``[MASK]``.

    Args:
        text: Input passage.
        size: Number of distinct words to mask; capped at the number of words
            in ``text``.

    Returns:
        A ``(masked_text, masked_words)`` tuple. ``masked_words`` lists the
        original words in the order they were replaced. When ``text`` already
        contains ``[MASK]`` (or contains no words at all) it is returned
        unchanged together with an empty list, so callers can always unpack
        two values.
    """
    words = text.split()
    # If the user supplies a mask, don't add more. A substring check is used so
    # that a mask with attached punctuation (e.g. "[MASK].") is also detected.
    if '[MASK]' in text or not words:
        return text, []

    # Draw without replacement so the same word is never masked twice, which
    # would otherwise record '[MASK]' itself as an "original" word.
    n_masks = max(0, min(int(size), len(words)))
    positions = np.random.choice(len(words), size=n_masks, replace=False)

    masked_words = []
    for pos in positions:
        masked_words.append(words[pos])
        words[pos] = '[MASK]'
    return ' '.join(words), masked_words
class TempScalePipe(FillMaskPipeline):
    """Fill-mask pipeline that divides the mask logits by a temperature before the softmax.

    On top of the ``top_k`` and ``targets`` keywords it accepts a ``temp``
    keyword at call time, which is forwarded to :meth:`postprocess`.
    """

    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
        """Collect call-time keyword arguments into postprocess parameters.

        Returns:
            A 3-tuple of dicts. The first two are always empty; the third holds
            whichever of ``target_ids``, ``top_k`` and ``temp`` were supplied.
        """
        postprocess_params = {}
        if targets is not None:
            postprocess_params["target_ids"] = self.get_target_ids(targets, top_k)
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        if temp is not None:
            postprocess_params["temp"] = temp
        return {}, {}, postprocess_params

    def __call__(self, inputs, *args, **kwargs):
        """
        Fill the masked token in the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of prompts) with masked tokens.
            targets (`str` or `List[str]`, *optional*):
                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                resulting token will be used (with a warning, and that might be slower).
            top_k (`int`, *optional*):
                When passed, overrides the number of predictions to return.
            temp (`float`, *optional*):
                When passed, the mask logits are divided by this value before the softmax.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
            - **score** (`float`) -- The corresponding probability.
            - **token** (`int`) -- The predicted token id (to replace the masked one).
            - **token_str** (`str`) -- The predicted token (to replace the masked one).
        """
        # Extra positional arguments are accepted for signature compatibility
        # but are not forwarded.
        outputs = super().__call__(inputs, **kwargs)
        # A one-element list of inputs is unwrapped to that element's result.
        if isinstance(inputs, list) and len(inputs) == 1:
            return outputs[0]
        return outputs

    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
        """Turn raw model outputs into ranked, temperature-scaled mask predictions.

        Args:
            model_outputs: Mapping with ``"input_ids"`` and ``"logits"`` entries.
            top_k: Number of candidates to keep per mask.
            target_ids: Optional vocabulary ids to restrict the candidates to.
            temp: Positive temperature the mask logits are divided by.

        Returns:
            For a single mask, a list of dicts with the keys ``score``,
            ``token``, ``token_str`` and ``sequence``; for several masks, one
            such list per mask.

        Raises:
            ValueError: If ``temp`` is not strictly positive.
        """
        # Dividing by zero or by a negative value would produce NaNs or invert
        # the ranking, so reject it up front.
        if temp <= 0:
            raise ValueError(f"temp must be a positive number, got {temp!r}")

        # Cap top_k if there are targets
        if target_ids is not None and target_ids.shape[0] < top_k:
            top_k = target_ids.shape[0]
        input_ids = model_outputs["input_ids"][0]
        outputs = model_outputs["logits"]
        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)

        # Temperature scaling happens on the logits, before normalisation.
        logits = outputs[0, masked_index, :] / temp
        probs = logits.softmax(dim=-1)
        if target_ids is not None:
            probs = probs[..., target_ids]
        values, predictions = probs.topk(top_k)

        result = []
        single_mask = values.shape[0] == 1
        for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
            row = []
            for v, p in zip(_values, _predictions):
                # Copy is important since we're going to modify this array in place
                tokens = input_ids.numpy().copy()
                if target_ids is not None:
                    # With targets, p indexes into target_ids rather than the vocab.
                    p = target_ids[p].tolist()
                tokens[masked_index[i]] = p
                # Filter padding out:
                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
                # Originally we skip special tokens to give readable output.
                # For multi masks though, the other [MASK] would be removed otherwise
                # making the output look odd, so we add them back
                sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
                proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
                row.append(proposition)
            result.append(row)
        if single_mask:
            return result[0]
        return result
# Expose the custom pipeline class under the "temp-scale" task name.
PIPELINE_REGISTRY.register_pipeline("temp-scale", pipeline_class=TempScalePipe, pt_model=AutoModelForMaskedLM)

# Module-level pipeline instance, created once at import time and used by `unmask`.
scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
def sample_output(out, sampling):
    """Pick one candidate token from a ``{token: score}`` mapping.

    Args:
        out: Mapping from candidate token string to its score.
        sampling: ``'multi'`` draws a token with probability proportional to
            its score; any other value draws uniformly at random.

    Returns:
        The selected token string (a key of ``out``).

    Raises:
        ValueError: If ``out`` is empty.
    """
    tokens = list(out)
    if not tokens:
        raise ValueError("sample_output needs at least one candidate token")

    if sampling == 'multi':
        weights = np.asarray([out[tok] for tok in tokens], dtype=float)
        total = weights.sum()
        if np.isfinite(total) and total > 0:
            # The scores are only the top-k slice of a distribution, so they
            # are renormalised here; otherwise the probability mass missing
            # from the slice would be credited to a single candidate.
            idx = np.random.choice(len(tokens), p=weights / total)
        else:
            # Degenerate scores carry no preference: fall back to uniform.
            idx = np.random.randint(len(tokens))
    else:
        idx = np.random.randint(len(tokens))
    # Tokens are indexed by position (not looked up by score) so candidates
    # that share the same score stay distinct.
    return tokens[idx]
def unmask(text, temp, rounds):
    """Repeatedly mask one word of ``text`` and replace it with a model suggestion.

    Each round masks a single word, asks ``scrambler`` for candidates at the
    given temperature, samples one of them and writes it back surrounded by
    asterisks.

    Args:
        text: Passage to edit. If it already contains ``[MASK]``, that mask is
            filled first instead of adding a new one.
        temp: Temperature forwarded to ``scrambler``.
        rounds: Number of edit rounds; coerced to ``int`` because UI sliders
            may deliver a float.

    Returns:
        The edited passage with its first character upper-cased. Empty or
        whitespace-only input is returned as is (``""`` for a falsy input).
    """
    # Nothing to mask and nothing to capitalise.
    if not text or not text.strip():
        return text or ""

    sampling = 'multi'
    max_resamples = 6  # bounded retries when the model proposes the original word
    for _ in range(int(rounds)):
        masked_result = add_mask(text, size=1)
        # Tolerate a bare string as well as a (masked_text, masked_words) pair.
        if isinstance(masked_result, str):
            masked_text, masked = masked_result, []
        else:
            masked_text, masked = masked_result[0], masked_result[1]

        split_text = masked_text.split()
        mask_positions = [i for i, t in enumerate(split_text) if '[MASK]' in t]
        if not mask_positions:
            break
        mask_pos = mask_positions[0]

        res = scrambler(masked_text, temp=temp, top_k=15)
        # With several masks in the text the result is one list per mask;
        # only the first mask is filled in this round.
        if res and isinstance(res[0], list):
            res = res[0]
        out = {item["token_str"]: item["score"] for item in res}

        original = masked[0] if masked else None
        new_token = sample_output(out, sampling)
        # Prefer a token that differs from the word that was masked out.
        for _attempt in range(max_resamples):
            if new_token != original:
                break
            new_token = sample_output(out, sampling='uniform')

        # Replace only the mask itself so attached punctuation survives.
        split_text[mask_pos] = split_text[mask_pos].replace('[MASK]', '*' + new_token + '*', 1)
        text = ' '.join(split_text)

    return text[:1].upper() + text[1:]
# --- Gradio UI wiring ---
textbox = gr.Textbox(label="Example prompts", lines=5)
output_textbox = gr.Textbox(placeholder="Output will appear here", lines=4)
temp_slider = gr.Slider(1.0, 2.0, value=1.0, label='Creativity')
# Unit steps and an integer default keep every whole edit count selectable,
# including the counts used in `examples` (10 and 20); a step of 5 counted
# from the minimum of 1 would not land on those values.
edit_slider = gr.Slider(1, 150, step=1, value=1, label='Number of edits')

# Inputs are passed to `unmask` in order: text, temperature, number of edits.
demo = gr.Interface(
    fn=unmask,
    inputs=[textbox, temp_slider, edit_slider],
    outputs=[output_textbox],
    examples=examples,
    allow_flagging='never',
)
demo.launch()
|