haydn-jones committed on
Commit
097ab1f
·
1 Parent(s): d8e9dcb

Upload tokenizer.ipynb

Browse files
Files changed (1) hide show
  1. utils/tokenizer.ipynb +114 -0
utils/tokenizer.ipynb ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import selfies as sf\n",
10
+ "from tokenizers import Tokenizer\n",
11
+ "from tokenizers.models import WordLevel\n",
12
+ "from tokenizers.pre_tokenizers import Split\n",
13
+ "from tokenizers.processors import TemplateProcessing\n",
14
+ "from tokenizers.trainers import WordLevelTrainer\n",
15
+ "from tqdm import tqdm"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "with open(\"./train.txt\") as f:\n",
25
+ " smiles = [line.strip() for line in f]\n",
26
+ "\n",
27
+ "selfies = []\n",
28
+ "for smile in tqdm(smiles):\n",
29
+ " try:\n",
30
+ " selfies.append(sf.encoder(smile))\n",
31
+ " except sf.EncoderError:\n",
32
+ " pass"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "tokenizer = Tokenizer(WordLevel(unk_token=\"<UNK>\"))\n",
42
+ "\n",
43
+ "tokenizer.pre_tokenizer = Split(\n",
44
+ " pattern=\"]\", \n",
45
+ " behavior=\"merged_with_previous\"\n",
46
+ ")\n",
47
+ "\n",
48
+ "trainer = WordLevelTrainer(\n",
49
+ " special_tokens=[\"<CLS>\", \"<EOS>\", \"<PAD>\", \"<UNK>\"]\n",
50
+ ")\n",
51
+ "\n",
52
+ "tokenizer.train_from_iterator(selfies, trainer=trainer)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "tokenizer.post_processor = TemplateProcessing(\n",
62
+ " single=\"<CLS> $A <EOS>\",\n",
63
+ " special_tokens=[\n",
64
+ " (\"<CLS>\", tokenizer.token_to_id(\"<CLS>\")),\n",
65
+ " (\"<EOS>\", tokenizer.token_to_id(\"<EOS>\")),\n",
66
+ " ],\n",
67
+ ")\n",
68
+ "\n",
69
+ "tokenizer.enable_padding(\n",
70
+ " direction=\"right\",\n",
71
+ " pad_id=tokenizer.token_to_id(\"<PAD>\"),\n",
72
+ " pad_token=\"<PAD>\",\n",
73
+ ")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "tokenizer.save(\"./tokenizer.json\")"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": []
91
+ }
92
+ ],
93
+ "metadata": {
94
+ "kernelspec": {
95
+ "display_name": "ddpm",
96
+ "language": "python",
97
+ "name": "python3"
98
+ },
99
+ "language_info": {
100
+ "codemirror_mode": {
101
+ "name": "ipython",
102
+ "version": 3
103
+ },
104
+ "file_extension": ".py",
105
+ "mimetype": "text/x-python",
106
+ "name": "python",
107
+ "nbconvert_exporter": "python",
108
+ "pygments_lexer": "ipython3",
109
+ "version": "3.11.6"
110
+ }
111
+ },
112
+ "nbformat": 4,
113
+ "nbformat_minor": 2
114
+ }