Spaces:

B-patents
/

patent-bert

Build error

danseith committed on Feb 26, 2023

Commit

cd3e092

1 Parent(s): 020fa3d

Fixed loop structure and output sampling to avoid infinite loops. Now allows deletions.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -135,34 +135,37 @@ PIPELINE_REGISTRY.register_pipeline(
 scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
 def unmask(text, temp, rounds):
     sampling = 'multi'
-    successful_iters = 0
-    unsuccessful_iters = 0
-    while successful_iters < rounds or unsuccessful_iters > 5:
-        unsuccessful_iters += 1
         tp = add_mask(text, size=1)
         masked_text, masked = tp[0], tp[1]
         split_text = masked_text.split()
-        res = scrambler(masked_text, temp=temp, top_k=10)
         mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
         out = {item["token_str"]: item["score"] for item in res}
-        score_to_str = {out[k] : k for k in out.keys()}
-        score_list = list(score_to_str.keys())
-        if sampling == 'multi':
-            idx = np.argmax(np.random.multinomial(1, score_list, 1))
-        else:
-            idx = np.random.randint(0, len(score_list))
-        score = score_list[idx]
-        new_token = score_to_str[score]
-        if len(list(new_token)) < 2 or new_token == masked[0]:
-            continue
         split_text[mask_pos] = '*' + new_token + '*'
         text = ' '.join(split_text)
-        successful_iters += 1
-        unsuccessful_iters -= 1
-    if unsuccessful_iters > 5:
-        text = "Ran into an issue :( Please try again."
     text = list(text)
     text[0] = text[0].upper()
     return ''.join(text)

 scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
+def sample_output(out, sampling):
+    score_to_str = {out[k]: k for k in out.keys()}
+    score_list = list(score_to_str.keys())
+    if sampling == 'multi':
+        idx = np.argmax(np.random.multinomial(1, score_list, 1))
+    else:
+        idx = np.random.randint(0, len(score_list))
+    score = score_list[idx]
+    return score_to_str[score]
 def unmask(text, temp, rounds):
     sampling = 'multi'
+    for _ in range(rounds):
         tp = add_mask(text, size=1)
         masked_text, masked = tp[0], tp[1]
         split_text = masked_text.split()
+        res = scrambler(masked_text, temp=temp, top_k=15)
         mask_pos = [i for i, t in enumerate(split_text) if 'MASK' in t][0]
         out = {item["token_str"]: item["score"] for item in res}
+        new_token = sample_output(out, sampling)
+        unsuccessful_iters = 0
+        while new_token == masked[0]:
+            if unsuccessful_iters > 5:
+                break
+            print(new_token)
+            new_token = sample_output(out, sampling='uniform')
+            unsuccessful_iters += 1
         split_text[mask_pos] = '*' + new_token + '*'
         text = ' '.join(split_text)
     text = list(text)
     text[0] = text[0].upper()
     return ''.join(text)