minko186 committed
Commit 20d4ded · Parent: 20dc449

keep para structure

Files changed (1):
  1. humanize.py +33 -23
humanize.py CHANGED
@@ -71,27 +71,37 @@ def paraphrase_text(
     tokenizer = tokenizers[model_name]
     model = models[model_name]
 
-    # paraphrase each chunk of text
-    sentences = sent_tokenize(text)  # sentence boundary detection
-    paraphrases = []
-    for sentence in progress.tqdm(sentences, desc="Humanizing"):
-        sentence = sentence.strip()
-        if len(sentence) == 0:
-            continue
-        inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
-        outputs = model.generate(
-            **inputs,
-            do_sample=True,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            max_length=128,
-            top_k=top_k,
-            length_penalty=length_penalty,
-        )
-        paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        paraphrases.append(paraphrased_sentence)
-        print(f"\nOriginal: {sentence}")
-        print(f"Paraphrased: {paraphrased_sentence}")
+    # Split the text into paragraphs
+    paragraphs = text.split("\n")
+    humanized_paragraphs = []
 
-    combined_paraphrase = " ".join(paraphrases)
-    return combined_paraphrase
+    for paragraph in paragraphs:
+        # paraphrase each chunk of text
+        sentences = sent_tokenize(paragraph)
+        paraphrases = []
+        for sentence in progress.tqdm(sentences, desc="Humanizing"):
+            sentence = sentence.strip()
+            if len(sentence) == 0:
+                continue
+            inputs = tokenizer(
+                "Please paraphrase this sentence: " + sentence,
+                return_tensors="pt",
+            ).to(device)
+            outputs = model.generate(
+                **inputs,
+                do_sample=True,
+                temperature=temperature,
+                repetition_penalty=repetition_penalty,
+                max_length=128,
+                top_k=top_k,
+                length_penalty=length_penalty,
+            )
+            paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            paraphrases.append(paraphrased_sentence)
+            print(f"\nOriginal: {sentence}")
+            print(f"Paraphrased: {paraphrased_sentence}")
+        combined_paraphrase = " ".join(paraphrases)
+        humanized_paragraphs.append(combined_paraphrase)
+
+    humanized_text = "\n".join(humanized_paragraphs)
+    return humanized_text
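
For reference, a minimal, self-contained sketch of the control flow this commit introduces: split on "\n" first so paragraph breaks survive, paraphrase each paragraph sentence by sentence, then re-join on "\n". The fake_paraphrase helper is a hypothetical stand-in for the tokenizer / model.generate / decode pipeline, not code from this repo; sent_tokenize is NLTK's and assumes the "punkt" tokenizer data has been downloaded.

from nltk.tokenize import sent_tokenize  # assumes nltk "punkt" data is available

def fake_paraphrase(sentence: str) -> str:
    # Hypothetical stand-in for tokenizer(...) -> model.generate(...) -> decode(...)
    return sentence.upper()

def humanize(text: str) -> str:
    # Same shape as the patched paraphrase_text: paragraph split -> sentence
    # loop -> re-join, so blank lines between paragraphs are preserved.
    humanized_paragraphs = []
    for paragraph in text.split("\n"):
        sentences = [s.strip() for s in sent_tokenize(paragraph) if s.strip()]
        paraphrases = [fake_paraphrase(s) for s in sentences]
        humanized_paragraphs.append(" ".join(paraphrases))
    return "\n".join(humanized_paragraphs)

print(humanize("One. Two.\n\nThree."))
# ONE. TWO.
#
# THREE.

The previous version ran sent_tokenize(text) over the whole input and joined every paraphrase with a single space, so all newlines were lost; splitting first and joining last is what "keep para structure" refers to.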