import argparse
import json
import os

from tqdm import tqdm


def process_line(line: str, old_text: str, new_text: str) -> str:
    """Replace text inside every string value of one JSON document.

    The line is parsed as JSON, walked recursively, and serialized again.
    Only string *values* are rewritten: dictionary keys are kept as they
    are, and non-string scalars (numbers, booleans, ``null``) pass through
    unchanged.

    Args:
        line: A single JSON document, e.g. one line of a JSONL file.
        old_text: Substring to search for.
        new_text: Replacement for every occurrence of ``old_text``.

    Returns:
        The re-serialized JSON text. Non-ASCII characters are written
        literally (``ensure_ascii=False``) rather than as ``\\uXXXX``
        escapes. Whitespace follows ``json.dumps`` defaults, so it may
        differ from the input's formatting.

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
    """
    def replace_text(obj):
        # Containers are rebuilt rather than mutated in place.
        if isinstance(obj, dict):
            # Keys are intentionally not rewritten, only the values.
            return {k: replace_text(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [replace_text(item) for item in obj]
        if isinstance(obj, str):
            return obj.replace(old_text, new_text)
        # int, float, bool, None: nothing to replace.
        return obj

    data = json.loads(line)
    processed_data = replace_text(data)
    return json.dumps(processed_data, ensure_ascii=False)


def main(input_file, output_file, old_text, new_text):
    """Rewrite a JSONL file, replacing text in every record's string values.

    Each non-blank line of ``input_file`` is passed to ``process_line`` and
    the result is written to ``output_file`` followed by a newline. Both
    files are opened as UTF-8. A progress bar is shown while processing.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the file to write; it is overwritten.
        old_text: Substring to search for.
        new_text: Replacement for every occurrence of ``old_text``.

    Raises:
        ValueError: If ``input_file`` and ``output_file`` are the same file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Opening the output with 'w' truncates it immediately; if it is the
    # same file as the input, every record would be lost before being read.
    if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError(
            "input_file and output_file refer to the same file; "
            "writing would truncate the input before it is read"
        )

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        # First pass only counts lines so the progress bar has a total,
        # then the file is rewound for the real pass.
        total_lines = sum(1 for _ in infile)
        infile.seek(0)

        for line in tqdm(infile, total=total_lines, desc="Processing"):
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing empty line) are not JSON
                # and would make json.loads fail; skip them.
                continue
            outfile.write(process_line(record, old_text, new_text) + '\n')


if __name__ == "__main__":
    # Command-line entry point: two positional paths plus optional
    # search/replacement strings, handed straight to main().
    parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
    parser.add_argument("input_file", help="Input JSONL file to process")
    parser.add_argument("output_file", help="Output file for processed JSONL")
    parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
    parser.add_argument("--new_text", default="FAYO", help="Text to replace with")
    args = parser.parse_args()

    main(args.input_file, args.output_file, args.old_text, args.new_text)