Vikrantyadav11234 committed on
Commit
da7b0e1
·
verified ·
1 Parent(s): 28d885f

Upload clean_inline_tags_file.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. clean_inline_tags_file.py +16 -0
clean_inline_tags_file.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def clean_inline_tag_file(input_file, output_file):
    """Strip the leading tag from each tab-separated sentence pair.

    Reads ``input_file`` line by line as UTF-8. Each line is stripped of
    surrounding whitespace and split on tabs; lines with fewer than two
    fields are skipped. For the first two fields, everything up to and
    including the first ``': '`` is removed (a field without ``': '`` is
    kept unchanged), and the result is written to ``output_file`` as
    ``source<TAB>target`` followed by a newline. Any fields beyond the
    second are ignored.

    Args:
        input_file: Path of the tagged, tab-separated text file to read.
        output_file: Path of the file to write; it is opened in ``'w'``
            mode, so existing content is replaced.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            parts = line.strip().split('\t')
            # A pair needs at least a source and a target column.
            if len(parts) < 2:
                continue
            # Split only once: the tag is the prefix before the FIRST ': '.
            # Splitting on every ': ' and keeping the last piece would drop
            # sentence text that itself contains a colon followed by a space.
            source = parts[0].split(': ', 1)[-1]
            target = parts[1].split(': ', 1)[-1]
            outfile.write(f"{source}\t{target}\n")
12
+
13
# Example usage: point both paths at real files before running this script.
# The call below executes whenever this module is run or imported.
input_file = '/home/vikrant-MNMT/myenv/NMT_V2/important_files/date_tags.txt'  # Replace with actual input file path
output_file = '/home/vikrant-MNMT/myenv/BPCC/inline_tages/eng_Latn-hin_Deva/date_tag_cleaned.txt'  # Replace with desired output file path
clean_inline_tag_file(input_file=input_file, output_file=output_file)