{ "cells": [ { "cell_type": "markdown", "id": "4ebeeae6", "metadata": { "toc": true }, "source": [ "

Содержание

\n", "
" ] }, { "cell_type": "markdown", "id": "12444a0f", "metadata": {}, "source": [ "# collect texts" ] }, { "cell_type": "code", "execution_count": 12, "id": "f3ddc049", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:41:10.093684Z", "start_time": "2024-06-15T22:41:10.089042Z" } }, "outputs": [], "source": [ "import html\n", "import os\n", "import re\n", "import shutil\n", "from bs4 import BeautifulSoup" ] }, { "cell_type": "code", "execution_count": null, "id": "9e0e1c77", "metadata": {}, "outputs": [], "source": [ "!pip install razdel" ] }, { "cell_type": "code", "execution_count": 10, "id": "9470fce5", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:40:47.962801Z", "start_time": "2024-06-15T22:40:47.854033Z" } }, "outputs": [], "source": [ "from razdel import sentenize\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": null, "id": "68b54deb", "metadata": {}, "outputs": [], "source": [ "!git clone https://github.com/tolstoydigital/TEI.git" ] }, { "cell_type": "code", "execution_count": 3, "id": "0d9b57a0", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:38:10.450986Z", "start_time": "2024-06-15T22:38:10.446063Z" } }, "outputs": [], "source": [ "relevant_dirs = ['diaries', 'letters', 'notes', 'works']" ] }, { "cell_type": "code", "execution_count": 4, "id": "375755d6", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:38:26.335811Z", "start_time": "2024-06-15T22:38:25.268500Z" } }, "outputs": [], "source": [ "path = 'TEI/reference/bibllist_works.xml'\n", "xml = open(path).read()\n", "soup = BeautifulSoup(xml, features=\"xml\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "f69acb38", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:38:48.857463Z", "start_time": "2024-06-15T22:38:48.668146Z" } }, "outputs": [], "source": [ "group_texts = {}\n", "for it in soup.find_all(\"item\"):\n", " ref = it.find(\"ref\")\n", " for related in it.find_all(\"relatedItem\"):\n", " for ref_ana in related.find_all(\"ref\"):\n", " group_texts[ref_ana.text] = ref.text" ] }, { "cell_type": "code", "execution_count": 6, "id": "182964df", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:39:31.844575Z", "start_time": "2024-06-15T22:39:31.796678Z" } }, "outputs": [], "source": [ "prefix_texts = 'extracted_texts'\n", "os.mkdir(prefix_texts)" ] }, { "cell_type": "code", "execution_count": 13, "id": "f75563a7", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T22:48:24.143625Z", "start_time": "2024-06-15T22:41:13.138087Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 4584/4584 [01:26<00:00, 53.24it/s] \n", "100%|██████████| 9087/9087 [02:26<00:00, 61.89it/s] \n", "100%|██████████| 100/100 [00:28<00:00, 3.49it/s]\n", "100%|██████████| 767/767 [02:49<00:00, 4.53it/s]\n" ] } ], "source": [ "if os.path.exists(prefix_texts):\n", " shutil.rmtree(prefix_texts)\n", "os.mkdir(prefix_texts)\n", "\n", "complex_texts = {}\n", "for rel_dir in relevant_dirs:\n", " path = os.path.join('TEI/texts', rel_dir)\n", " for file in tqdm(sorted(os.listdir(path))):\n", " fiction = 0\n", " if not file.endswith('.xml'):\n", " continue\n", " xml = open(os.path.join(path, file)).read()\n", " if 'Печатные варианты' in xml:\n", " continue\n", " nameID = file.replace('.xml', '')\n", " soup = BeautifulSoup(xml, features=\"xml\")\n", " if soup.find(\"catRef\", {\"ana\":\"#fiction\"}):\n", " fiction = 1\n", " s = soup.find(\"body\")\n", " paragraphs = []\n", " for erase in s.find_all([\"orig\", \"comments\", \"sic\", \"note\"]):\n", " erase.decompose()\n", " for p in s.find_all([\"p\", \"l\"]):\n", " paragraphs.append(html.unescape(p.text.replace('\\n', ' ').strip()))\n", " if not fiction:\n", " with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:\n", " for par in paragraphs:\n", " par = re.sub(' ([.,;:!?)\"»])', '\\\\1', par)\n", " par = par.replace('\\n', ' ')\n", " par = par.strip()\n", " par = re.sub('\\s+', ' ', par)\n", " par = re.sub('\\[.+?\\]', '', par)\n", " for sent in sentenize(par):\n", " f.write(list(sent)[2].strip() + '\\n')\n", " else:\n", " if nameID in group_texts:\n", " hyper_name = group_texts[nameID]\n", " if hyper_name not in complex_texts:\n", " complex_texts[hyper_name] = paragraphs\n", " else:\n", " complex_texts[hyper_name].extend(paragraphs)\n", " else:\n", " with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:\n", " f.write('\\n'.join(paragraphs))\n", "for hyper_name in complex_texts:\n", " with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:\n", " f.write('\\n'.join(complex_texts[hyper_name]))" ] }, { "cell_type": "markdown", "id": "1eddfe2e", "metadata": {}, "source": [ "# tagging" ] }, { "cell_type": "code", "execution_count": 14, "id": "115d0c54", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T23:52:40.491525Z", "start_time": "2024-06-15T23:52:40.416283Z" } }, "outputs": [], "source": [ "from pymystem3 import Mystem" ] }, { "cell_type": "code", "execution_count": null, "id": "c9441d01", "metadata": { "ExecuteTime": { "end_time": "2024-06-15T23:53:11.904746Z", "start_time": "2024-06-15T23:53:11.901127Z" } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 19, "id": "6b1d6b0d", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:03:59.149011Z", "start_time": "2024-06-16T00:03:59.132762Z" } }, "outputs": [], "source": [ "def tagging():\n", " m = Mystem()\n", " for fl in os.listdir(prefix_texts):\n", " #print(fl)\n", " if 'mystem' in fl:\n", " continue\n", " with open(os.path.join(prefix_texts, fl)) as f:\n", " text = f.read()\n", " lines = text.split('\\n')\n", " ana_lines = []\n", " for line in lines:\n", " line = ' '.join(line.split()[1:])\n", " line = line.replace('ò', 'о')\n", " line = line.replace('è', 'е')\n", " line = line.replace('à', 'а')\n", " line = line.replace('ѝ', 'и')\n", " line = line.replace('ỳ', 'у')\n", " line = line.replace('о̀', 'о')\n", " #line = line.replace('Изд.̀', 'издательство')\n", " ana = []\n", " info = m.analyze(line)\n", " for token in info:\n", " if \"analysis\" in token:\n", " try:\n", " analysis = token[\"analysis\"][0]\n", " except:\n", " #print(token)\n", " continue\n", " # if \"lex\" in analysis:\n", " lex = analysis[\"lex\"]\n", " #if 'gr' in analysis:\n", " gr = analysis['gr']\n", " #print(gr)\n", " const = gr.split('=')[0]\n", " if ',' in const:\n", " pos = const.split(',')[0]\n", " else:\n", " pos = const\n", " \n", " ana.append('{}_{}'.format(lex, pos))\n", " ln = ' '.join(ana)\n", " if re.search('[А-Яа-я]', ln):\n", " ana_lines.append(ln)\n", " with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:\n", " fw.write('\\n'.join(ana_lines))" ] }, { "cell_type": "code", "execution_count": 20, "id": "d02fd91a", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:12:05.148374Z", "start_time": "2024-06-16T00:04:01.782191Z" } }, "outputs": [], "source": [ "tagging()" ] }, { "cell_type": "code", "execution_count": 18, "id": "f9384f57", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:03:50.492957Z", "start_time": "2024-06-16T00:03:50.485417Z" } }, "outputs": [], "source": [ "pos = ['S', 'V', 'A', 'ADV']" ] }, { "cell_type": "code", "execution_count": 22, "id": "1bc596d8", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:13:58.529072Z", "start_time": "2024-06-16T00:13:50.475301Z" } }, "outputs": [], "source": [ "def mk_input():\n", " inp = []\n", " for fl in os.listdir(prefix_texts):\n", " if not 'mystem' in fl:\n", " continue\n", " #print(fl)\n", " with open(os.path.join(prefix_texts, fl)) as f:\n", " text = f.read()\n", " lines = text.split('\\n')\n", " for line in lines:\n", " words = []\n", " for w in line.split():\n", " word = w.split('_')\n", " if word[1] in pos:\n", " words.append(w)\n", " if len(words) > 1:\n", " inp.append(' '.join(words))\n", " \n", " with open('input.txt', 'w') as fw:\n", " fw.write('\\n'.join(inp))\n", " \n", "mk_input()" ] }, { "cell_type": "markdown", "id": "82faa45f", "metadata": {}, "source": [ "# build models" ] }, { "cell_type": "code", "execution_count": 26, "id": "2d402b64", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:15:17.796668Z", "start_time": "2024-06-16T00:15:13.952859Z" } }, "outputs": [], "source": [ "import sys\n", "import logging\n", "import gensim" ] }, { "cell_type": "code", "execution_count": 24, "id": "5c994f11", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:14:45.969563Z", "start_time": "2024-06-16T00:14:45.965851Z" } }, "outputs": [], "source": [ "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" ] }, { "cell_type": "code", "execution_count": 27, "id": "3ae5d312", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:15:19.248796Z", "start_time": "2024-06-16T00:15:19.244970Z" } }, "outputs": [], "source": [ "pth = './input.txt'\n", "data = gensim.models.word2vec.LineSentence(pth)" ] }, { "cell_type": "code", "execution_count": 28, "id": "2af9f6a3", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:16:45.088436Z", "start_time": "2024-06-16T00:15:24.270931Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-06-16 03:15:24,273 : INFO : collecting all words and their counts\n", "2024-06-16 03:15:24,278 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", "2024-06-16 03:15:24,440 : INFO : PROGRESS: at sentence #10000, processed 163169 words, keeping 17313 word types\n", "2024-06-16 03:15:24,556 : INFO : PROGRESS: at sentence #20000, processed 296024 words, keeping 23165 word types\n", "2024-06-16 03:15:24,672 : INFO : PROGRESS: at sentence #30000, processed 405629 words, keeping 26267 word types\n", "2024-06-16 03:15:24,800 : INFO : PROGRESS: at sentence #40000, processed 494131 words, keeping 27548 word types\n", "2024-06-16 03:15:24,929 : INFO : PROGRESS: at sentence #50000, processed 582330 words, keeping 28502 word types\n", "2024-06-16 03:15:25,090 : INFO : PROGRESS: at sentence #60000, processed 706637 words, keeping 30127 word types\n", "2024-06-16 03:15:25,233 : INFO : PROGRESS: at sentence #70000, processed 847070 words, keeping 35040 word types\n", "2024-06-16 03:15:25,373 : INFO : PROGRESS: at sentence #80000, processed 993651 words, keeping 36885 word types\n", "2024-06-16 03:15:25,543 : INFO : PROGRESS: at sentence #90000, processed 1136268 words, keeping 38211 word types\n", "2024-06-16 03:15:25,664 : INFO : PROGRESS: at sentence #100000, processed 1224248 words, keeping 38914 word types\n", "2024-06-16 03:15:25,767 : INFO : PROGRESS: at sentence #110000, processed 1315622 words, keeping 39146 word types\n", "2024-06-16 03:15:25,920 : INFO : PROGRESS: at sentence #120000, processed 1402410 words, keeping 41123 word types\n", "2024-06-16 03:15:25,996 : INFO : PROGRESS: at sentence #130000, processed 1445449 words, keeping 43388 word types\n", "2024-06-16 03:15:26,136 : INFO : PROGRESS: at sentence #140000, processed 1568666 words, keeping 45060 word types\n", "2024-06-16 03:15:26,215 : INFO : PROGRESS: at sentence #150000, processed 1625965 words, keeping 46676 word types\n", "2024-06-16 03:15:26,317 : INFO : PROGRESS: at sentence #160000, processed 1676634 words, keeping 47593 word types\n", "2024-06-16 03:15:26,404 : INFO : PROGRESS: at sentence #170000, processed 1740516 words, keeping 48188 word types\n", "2024-06-16 03:15:26,493 : INFO : PROGRESS: at sentence #180000, processed 1813933 words, keeping 48617 word types\n", "2024-06-16 03:15:26,572 : INFO : PROGRESS: at sentence #190000, processed 1873429 words, keeping 49050 word types\n", "2024-06-16 03:15:26,729 : INFO : PROGRESS: at sentence #200000, processed 2053959 words, keeping 51906 word types\n", "2024-06-16 03:15:26,891 : INFO : PROGRESS: at sentence #210000, processed 2244539 words, keeping 54019 word types\n", "2024-06-16 03:15:27,068 : INFO : PROGRESS: at sentence #220000, processed 2470575 words, keeping 56680 word types\n", "2024-06-16 03:15:27,261 : INFO : PROGRESS: at sentence #230000, processed 2707756 words, keeping 58963 word types\n", "2024-06-16 03:15:27,461 : INFO : PROGRESS: at sentence #240000, processed 2944125 words, keeping 60490 word types\n", "2024-06-16 03:15:27,606 : INFO : PROGRESS: at sentence #250000, processed 3100643 words, keeping 61888 word types\n", "2024-06-16 03:15:27,697 : INFO : PROGRESS: at sentence #260000, processed 3175131 words, keeping 62638 word types\n", "2024-06-16 03:15:27,789 : INFO : PROGRESS: at sentence #270000, processed 3246248 words, keeping 63144 word types\n", "2024-06-16 03:15:27,887 : INFO : PROGRESS: at sentence #280000, processed 3324748 words, keeping 63561 word types\n", "2024-06-16 03:15:27,991 : INFO : PROGRESS: at sentence #290000, processed 3406817 words, keeping 64030 word types\n", "2024-06-16 03:15:28,091 : INFO : PROGRESS: at sentence #300000, processed 3491208 words, keeping 64525 word types\n", "2024-06-16 03:15:28,190 : INFO : PROGRESS: at sentence #310000, processed 3567554 words, keeping 64999 word types\n", "2024-06-16 03:15:28,300 : INFO : PROGRESS: at sentence #320000, processed 3653814 words, keeping 65369 word types\n", "2024-06-16 03:15:28,397 : INFO : PROGRESS: at sentence #330000, processed 3727990 words, keeping 65832 word types\n", "2024-06-16 03:15:28,482 : INFO : PROGRESS: at sentence #340000, processed 3785783 words, keeping 66225 word types\n", "2024-06-16 03:15:28,561 : INFO : PROGRESS: at sentence #350000, processed 3851922 words, keeping 66596 word types\n", "2024-06-16 03:15:28,682 : INFO : PROGRESS: at sentence #360000, processed 3955668 words, keeping 67421 word types\n", "2024-06-16 03:15:28,732 : INFO : collected 67995 word types from a corpus of 4013071 raw words and 362579 sentences\n", "2024-06-16 03:15:28,733 : INFO : Creating a fresh vocabulary\n", "2024-06-16 03:15:29,065 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 40255 unique words (59.20% of original 67995, drops 27740)', 'datetime': '2024-06-16T03:15:29.048822', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", "2024-06-16 03:15:29,066 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 3985331 word corpus (99.31% of original 4013071, drops 27740)', 'datetime': '2024-06-16T03:15:29.066649', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", "2024-06-16 03:15:29,499 : INFO : deleting the raw counts dictionary of 67995 items\n", "2024-06-16 03:15:29,502 : INFO : sample=0.001 downsamples 30 most-common words\n", "2024-06-16 03:15:29,503 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 3707673.978375597 word corpus (93.0%% of prior 3985331)', 'datetime': '2024-06-16T03:15:29.503269', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", "2024-06-16 03:15:30,266 : INFO : estimated required memory for 40255 words and 500 dimensions: 181147500 bytes\n", "2024-06-16 03:15:30,267 : INFO : resetting layer weights\n", "2024-06-16 03:15:30,558 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2024-06-16T03:15:30.558380', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'build_vocab'}\n", "2024-06-16 03:15:30,559 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 40255 vocabulary and 500 features, using sg=1 hs=0 sample=0.001 negative=5 window=2 shrink_windows=True', 'datetime': '2024-06-16T03:15:30.559294', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", "2024-06-16 03:15:31,645 : INFO : EPOCH 0 - PROGRESS: at 3.31% examples, 175241 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:32,667 : INFO : EPOCH 0 - PROGRESS: at 11.85% examples, 231581 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:33,729 : INFO : EPOCH 0 - PROGRESS: at 18.62% examples, 236754 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:34,753 : INFO : EPOCH 0 - PROGRESS: at 24.47% examples, 246985 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:35,796 : INFO : EPOCH 0 - PROGRESS: at 36.59% examples, 256599 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:36,796 : INFO : EPOCH 0 - PROGRESS: at 46.75% examples, 256986 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:15:37,830 : INFO : EPOCH 0 - PROGRESS: at 55.05% examples, 259692 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:38,868 : INFO : EPOCH 0 - PROGRESS: at 59.24% examples, 261775 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:15:39,868 : INFO : EPOCH 0 - PROGRESS: at 62.57% examples, 261708 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:40,877 : INFO : EPOCH 0 - PROGRESS: at 66.04% examples, 263237 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:41,922 : INFO : EPOCH 0 - PROGRESS: at 74.34% examples, 264375 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:15:42,924 : INFO : EPOCH 0 - PROGRESS: at 84.50% examples, 265285 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:43,935 : INFO : EPOCH 0 - PROGRESS: at 95.74% examples, 265237 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:44,508 : INFO : EPOCH 0: training on 4013071 raw words (3706579 effective words) took 13.9s, 266440 effective words/s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2024-06-16 03:15:45,538 : INFO : EPOCH 1 - PROGRESS: at 4.82% examples, 245175 words/s, in_qsize 5, out_qsize 1\n", "2024-06-16 03:15:46,538 : INFO : EPOCH 1 - PROGRESS: at 13.74% examples, 264455 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:47,560 : INFO : EPOCH 1 - PROGRESS: at 20.30% examples, 271388 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:48,571 : INFO : EPOCH 1 - PROGRESS: at 27.11% examples, 273492 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:15:49,589 : INFO : EPOCH 1 - PROGRESS: at 37.36% examples, 266737 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:15:50,610 : INFO : EPOCH 1 - PROGRESS: at 49.03% examples, 270570 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:51,651 : INFO : EPOCH 1 - PROGRESS: at 55.81% examples, 269964 words/s, in_qsize 3, out_qsize 2\n", "2024-06-16 03:15:52,652 : INFO : EPOCH 1 - PROGRESS: at 59.80% examples, 270869 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:53,694 : INFO : EPOCH 1 - PROGRESS: at 63.30% examples, 270642 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:54,753 : INFO : EPOCH 1 - PROGRESS: at 67.06% examples, 270853 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:55,794 : INFO : EPOCH 1 - PROGRESS: at 75.77% examples, 268905 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:56,813 : INFO : EPOCH 1 - PROGRESS: at 87.02% examples, 271322 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:57,832 : INFO : EPOCH 1 - PROGRESS: at 98.72% examples, 271369 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:15:58,134 : INFO : EPOCH 1: training on 4013071 raw words (3707481 effective words) took 13.6s, 272298 effective words/s\n", "2024-06-16 03:15:59,190 : INFO : EPOCH 2 - PROGRESS: at 4.90% examples, 238625 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:16:00,191 : INFO : EPOCH 2 - PROGRESS: at 13.74% examples, 260706 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:01,207 : INFO : EPOCH 2 - PROGRESS: at 19.69% examples, 260333 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:02,207 : INFO : EPOCH 2 - PROGRESS: at 25.56% examples, 261725 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:03,234 : INFO : EPOCH 2 - PROGRESS: at 37.33% examples, 265675 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:04,250 : INFO : EPOCH 2 - PROGRESS: at 48.30% examples, 266960 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:05,339 : INFO : EPOCH 2 - PROGRESS: at 55.19% examples, 262437 words/s, in_qsize 3, out_qsize 2\n", "2024-06-16 03:16:06,359 : INFO : EPOCH 2 - PROGRESS: at 58.68% examples, 258004 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:16:07,365 : INFO : EPOCH 2 - PROGRESS: at 61.57% examples, 255105 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:16:08,426 : INFO : EPOCH 2 - PROGRESS: at 65.03% examples, 255134 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:09,446 : INFO : EPOCH 2 - PROGRESS: at 69.40% examples, 254291 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:10,471 : INFO : EPOCH 2 - PROGRESS: at 81.05% examples, 257874 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:11,491 : INFO : EPOCH 2 - PROGRESS: at 91.69% examples, 258851 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:12,338 : INFO : EPOCH 2: training on 4013071 raw words (3707528 effective words) took 14.2s, 261187 effective words/s\n", "2024-06-16 03:16:13,375 : INFO : EPOCH 3 - PROGRESS: at 4.15% examples, 214882 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:14,381 : INFO : EPOCH 3 - PROGRESS: at 10.86% examples, 222177 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:15,407 : INFO : EPOCH 3 - PROGRESS: at 17.63% examples, 227504 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:16:16,434 : INFO : EPOCH 3 - PROGRESS: at 22.14% examples, 224431 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:16:17,437 : INFO : EPOCH 3 - PROGRESS: at 27.69% examples, 221242 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:18,443 : INFO : EPOCH 3 - PROGRESS: at 37.76% examples, 224767 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:19,444 : INFO : EPOCH 3 - PROGRESS: at 46.77% examples, 224511 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:16:20,507 : INFO : EPOCH 3 - PROGRESS: at 54.22% examples, 226818 words/s, in_qsize 6, out_qsize 1\n", "2024-06-16 03:16:21,534 : INFO : EPOCH 3 - PROGRESS: at 58.79% examples, 231683 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:22,541 : INFO : EPOCH 3 - PROGRESS: at 61.92% examples, 233456 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:23,544 : INFO : EPOCH 3 - PROGRESS: at 64.52% examples, 230900 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:24,567 : INFO : EPOCH 3 - PROGRESS: at 68.05% examples, 232123 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:16:25,606 : INFO : EPOCH 3 - PROGRESS: at 76.77% examples, 230723 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:26,610 : INFO : EPOCH 3 - PROGRESS: at 86.37% examples, 232550 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:27,619 : INFO : EPOCH 3 - PROGRESS: at 95.34% examples, 231091 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:16:28,232 : INFO : EPOCH 3: training on 4013071 raw words (3707859 effective words) took 15.9s, 233372 effective words/s\n", "2024-06-16 03:16:29,237 : INFO : EPOCH 4 - PROGRESS: at 3.66% examples, 202153 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:30,249 : INFO : EPOCH 4 - PROGRESS: at 9.25% examples, 201858 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:31,265 : INFO : EPOCH 4 - PROGRESS: at 16.41% examples, 211957 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:32,302 : INFO : EPOCH 4 - PROGRESS: at 20.89% examples, 209733 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:16:33,338 : INFO : EPOCH 4 - PROGRESS: at 24.80% examples, 202976 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:16:34,352 : INFO : EPOCH 4 - PROGRESS: at 31.25% examples, 201556 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:35,404 : INFO : EPOCH 4 - PROGRESS: at 38.49% examples, 198993 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:36,419 : INFO : EPOCH 4 - PROGRESS: at 49.03% examples, 201321 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:37,456 : INFO : EPOCH 4 - PROGRESS: at 54.64% examples, 202724 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:38,505 : INFO : EPOCH 4 - PROGRESS: at 58.45% examples, 204538 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:39,580 : INFO : EPOCH 4 - PROGRESS: at 61.92% examples, 209768 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:16:40,591 : INFO : EPOCH 4 - PROGRESS: at 64.80% examples, 210755 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:41,604 : INFO : EPOCH 4 - PROGRESS: at 68.06% examples, 212147 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:16:42,605 : INFO : EPOCH 4 - PROGRESS: at 78.95% examples, 216708 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:43,606 : INFO : EPOCH 4 - PROGRESS: at 87.31% examples, 217568 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:44,611 : INFO : EPOCH 4 - PROGRESS: at 98.97% examples, 221720 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:16:44,891 : INFO : EPOCH 4: training on 4013071 raw words (3706946 effective words) took 16.7s, 222554 effective words/s\n", "2024-06-16 03:16:44,892 : INFO : Word2Vec lifecycle event {'msg': 'training on 20065355 raw words (18536393 effective words) took 74.3s, 249375 effective words/s', 'datetime': '2024-06-16T03:16:44.892076', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", "2024-06-16 03:16:44,895 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec', 'datetime': '2024-06-16T03:16:44.895721', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'created'}\n", "2024-06-16 03:16:44,896 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'skipgram_500_2.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-06-16T03:16:44.896765', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'saving'}\n", "2024-06-16 03:16:44,897 : INFO : storing np array 'vectors' to skipgram_500_2.model.wv.vectors.npy\n", "2024-06-16 03:16:45,009 : INFO : storing np array 'syn1neg' to skipgram_500_2.model.syn1neg.npy\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2024-06-16 03:16:45,056 : INFO : not storing attribute cum_table\n", "2024-06-16 03:16:45,084 : INFO : saved skipgram_500_2.model\n" ] } ], "source": [ "modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1) # comparable with web_mystem_skipgram_500_2_2015.bin\n", "modelLNT1.save('skipgram_500_2.model') # modelLNT1 = Word2Vec.load(\"skipgram_500_2.model\")" ] }, { "cell_type": "code", "execution_count": 29, "id": "0a7ec57a", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:17:48.857219Z", "start_time": "2024-06-16T00:16:56.047446Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-06-16 03:16:56,049 : INFO : collecting all words and their counts\n", "2024-06-16 03:16:56,052 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", "2024-06-16 03:16:56,262 : INFO : PROGRESS: at sentence #10000, processed 163169 words, keeping 17313 word types\n", "2024-06-16 03:16:56,392 : INFO : PROGRESS: at sentence #20000, processed 296024 words, keeping 23165 word types\n", "2024-06-16 03:16:56,513 : INFO : PROGRESS: at sentence #30000, processed 405629 words, keeping 26267 word types\n", "2024-06-16 03:16:56,602 : INFO : PROGRESS: at sentence #40000, processed 494131 words, keeping 27548 word types\n", "2024-06-16 03:16:56,705 : INFO : PROGRESS: at sentence #50000, processed 582330 words, keeping 28502 word types\n", "2024-06-16 03:16:56,833 : INFO : PROGRESS: at sentence #60000, processed 706637 words, keeping 30127 word types\n", "2024-06-16 03:16:56,977 : INFO : PROGRESS: at sentence #70000, processed 847070 words, keeping 35040 word types\n", "2024-06-16 03:16:57,114 : INFO : PROGRESS: at sentence #80000, processed 993651 words, keeping 36885 word types\n", "2024-06-16 03:16:57,270 : INFO : PROGRESS: at sentence #90000, processed 1136268 words, keeping 38211 word types\n", "2024-06-16 03:16:57,381 : INFO : PROGRESS: at sentence #100000, processed 1224248 words, keeping 38914 word types\n", "2024-06-16 03:16:57,483 : INFO : PROGRESS: at sentence #110000, processed 1315622 words, keeping 39146 word types\n", "2024-06-16 03:16:57,597 : INFO : PROGRESS: at sentence #120000, processed 1402410 words, keeping 41123 word types\n", "2024-06-16 03:16:57,668 : INFO : PROGRESS: at sentence #130000, processed 1445449 words, keeping 43388 word types\n", "2024-06-16 03:16:57,791 : INFO : PROGRESS: at sentence #140000, processed 1568666 words, keeping 45060 word types\n", "2024-06-16 03:16:57,871 : INFO : PROGRESS: at sentence #150000, processed 1625965 words, keeping 46676 word types\n", "2024-06-16 03:16:57,967 : INFO : PROGRESS: at sentence #160000, processed 1676634 words, keeping 47593 word types\n", "2024-06-16 03:16:58,070 : INFO : PROGRESS: at sentence #170000, processed 1740516 words, keeping 48188 word types\n", "2024-06-16 03:16:58,181 : INFO : PROGRESS: at sentence #180000, processed 1813933 words, keeping 48617 word types\n", "2024-06-16 03:16:58,277 : INFO : PROGRESS: at sentence #190000, processed 1873429 words, keeping 49050 word types\n", "2024-06-16 03:16:58,476 : INFO : PROGRESS: at sentence #200000, processed 2053959 words, keeping 51906 word types\n", "2024-06-16 03:16:58,643 : INFO : PROGRESS: at sentence #210000, processed 2244539 words, keeping 54019 word types\n", "2024-06-16 03:16:58,832 : INFO : PROGRESS: at sentence #220000, processed 2470575 words, keeping 56680 word types\n", "2024-06-16 03:16:59,029 : INFO : PROGRESS: at sentence #230000, processed 2707756 words, keeping 58963 word types\n", "2024-06-16 03:16:59,219 : INFO : PROGRESS: at sentence #240000, processed 2944125 words, keeping 60490 word types\n", "2024-06-16 03:16:59,368 : INFO : PROGRESS: at sentence #250000, processed 3100643 words, keeping 61888 word types\n", "2024-06-16 03:16:59,462 : INFO : PROGRESS: at sentence #260000, processed 3175131 words, keeping 62638 word types\n", "2024-06-16 03:16:59,551 : INFO : PROGRESS: at sentence #270000, processed 3246248 words, keeping 63144 word types\n", "2024-06-16 03:16:59,650 : INFO : PROGRESS: at sentence #280000, processed 3324748 words, keeping 63561 word types\n", "2024-06-16 03:16:59,750 : INFO : PROGRESS: at sentence #290000, processed 3406817 words, keeping 64030 word types\n", "2024-06-16 03:16:59,852 : INFO : PROGRESS: at sentence #300000, processed 3491208 words, keeping 64525 word types\n", "2024-06-16 03:16:59,946 : INFO : PROGRESS: at sentence #310000, processed 3567554 words, keeping 64999 word types\n", "2024-06-16 03:17:00,056 : INFO : PROGRESS: at sentence #320000, processed 3653814 words, keeping 65369 word types\n", "2024-06-16 03:17:00,141 : INFO : PROGRESS: at sentence #330000, processed 3727990 words, keeping 65832 word types\n", "2024-06-16 03:17:00,219 : INFO : PROGRESS: at sentence #340000, processed 3785783 words, keeping 66225 word types\n", "2024-06-16 03:17:00,314 : INFO : PROGRESS: at sentence #350000, processed 3851922 words, keeping 66596 word types\n", "2024-06-16 03:17:00,423 : INFO : PROGRESS: at sentence #360000, processed 3955668 words, keeping 67421 word types\n", "2024-06-16 03:17:00,477 : INFO : collected 67995 word types from a corpus of 4013071 raw words and 362579 sentences\n", "2024-06-16 03:17:00,478 : INFO : Creating a fresh vocabulary\n", "2024-06-16 03:17:00,844 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 40255 unique words (59.20% of original 67995, drops 27740)', 'datetime': '2024-06-16T03:17:00.844097', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", "2024-06-16 03:17:00,845 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 3985331 word corpus (99.31% of original 4013071, drops 27740)', 'datetime': '2024-06-16T03:17:00.845547', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", "2024-06-16 03:17:01,316 : INFO : deleting the raw counts dictionary of 67995 items\n", "2024-06-16 03:17:01,318 : INFO : sample=0.001 downsamples 30 most-common words\n", "2024-06-16 03:17:01,320 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 3707673.978375597 word corpus (93.0%% of prior 3985331)', 'datetime': '2024-06-16T03:17:01.320105', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", "2024-06-16 03:17:02,054 : INFO : estimated required memory for 40255 words and 300 dimensions: 116739500 bytes\n", "2024-06-16 03:17:02,055 : INFO : resetting layer weights\n", "2024-06-16 03:17:02,261 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2024-06-16T03:17:02.261886', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'build_vocab'}\n", "2024-06-16 03:17:02,262 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 40255 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10 shrink_windows=True', 'datetime': '2024-06-16T03:17:02.262796', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", "2024-06-16 03:17:03,289 : INFO : EPOCH 0 - PROGRESS: at 7.88% examples, 359855 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:04,312 : INFO : EPOCH 0 - PROGRESS: at 20.90% examples, 420774 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:05,346 : INFO : EPOCH 0 - PROGRESS: at 32.77% examples, 417947 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:06,372 : INFO : EPOCH 0 - PROGRESS: at 52.69% examples, 425485 words/s, in_qsize 4, out_qsize 2\n", "2024-06-16 03:17:07,393 : INFO : EPOCH 0 - PROGRESS: at 60.15% examples, 436648 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:17:08,400 : INFO : EPOCH 0 - PROGRESS: at 66.70% examples, 450158 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:09,415 : INFO : EPOCH 0 - PROGRESS: at 81.39% examples, 447094 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:10,423 : INFO : EPOCH 0 - PROGRESS: at 97.78% examples, 440450 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:10,718 : INFO : EPOCH 0: training on 4013071 raw words (3707249 effective words) took 8.4s, 439586 effective words/s\n", "2024-06-16 03:17:11,751 : INFO : EPOCH 1 - PROGRESS: at 4.82% examples, 245164 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:12,762 : INFO : EPOCH 1 - PROGRESS: at 11.53% examples, 231748 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:13,770 : INFO : EPOCH 1 - PROGRESS: at 19.14% examples, 253363 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:17:14,817 : INFO : EPOCH 1 - PROGRESS: at 27.11% examples, 271322 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:17:15,819 : INFO : EPOCH 1 - PROGRESS: at 39.77% examples, 287807 words/s, in_qsize 5, out_qsize 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2024-06-16 03:17:16,847 : INFO : EPOCH 1 - PROGRESS: at 55.19% examples, 308798 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:17,859 : INFO : EPOCH 1 - PROGRESS: at 62.24% examples, 336579 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:18,881 : INFO : EPOCH 1 - PROGRESS: at 66.86% examples, 337849 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:19,889 : INFO : EPOCH 1 - PROGRESS: at 79.56% examples, 342120 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:20,947 : INFO : EPOCH 1 - PROGRESS: at 87.02% examples, 326518 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:17:21,959 : INFO : EPOCH 1 - PROGRESS: at 100.00% examples, 330219 words/s, in_qsize 0, out_qsize 1\n", "2024-06-16 03:17:21,961 : INFO : EPOCH 1: training on 4013071 raw words (3707615 effective words) took 11.2s, 330162 effective words/s\n", "2024-06-16 03:17:22,981 : INFO : EPOCH 2 - PROGRESS: at 11.53% examples, 467727 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:23,992 : INFO : EPOCH 2 - PROGRESS: at 22.81% examples, 468651 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:17:25,012 : INFO : EPOCH 2 - PROGRESS: at 36.59% examples, 439519 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:26,046 : INFO : EPOCH 2 - PROGRESS: at 53.83% examples, 447931 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:17:27,054 : INFO : EPOCH 2 - PROGRESS: at 60.26% examples, 441219 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:28,061 : INFO : EPOCH 2 - PROGRESS: at 67.06% examples, 455493 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:29,087 : INFO : EPOCH 2 - PROGRESS: at 81.78% examples, 449594 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:30,098 : INFO : EPOCH 2 - PROGRESS: at 99.61% examples, 452851 words/s, in_qsize 3, out_qsize 1\n", "2024-06-16 03:17:30,133 : INFO : EPOCH 2: training on 4013071 raw words (3707669 effective words) took 8.2s, 454472 effective words/s\n", "2024-06-16 03:17:31,161 : INFO : EPOCH 3 - PROGRESS: at 12.18% examples, 477366 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:32,209 : INFO : EPOCH 3 - PROGRESS: at 21.96% examples, 438616 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:17:33,241 : INFO : EPOCH 3 - PROGRESS: at 31.91% examples, 406354 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:34,258 : INFO : EPOCH 3 - PROGRESS: at 52.60% examples, 419940 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:35,271 : INFO : EPOCH 3 - PROGRESS: at 59.11% examples, 420036 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:36,279 : INFO : EPOCH 3 - PROGRESS: at 64.67% examples, 422528 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:37,294 : INFO : EPOCH 3 - PROGRESS: at 76.41% examples, 426170 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:38,349 : INFO : EPOCH 3 - PROGRESS: at 87.92% examples, 409510 words/s, in_qsize 4, out_qsize 1\n", "2024-06-16 03:17:39,361 : INFO : EPOCH 3 - PROGRESS: at 98.97% examples, 393706 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:39,471 : INFO : EPOCH 3: training on 4013071 raw words (3707281 effective words) took 9.3s, 397223 effective words/s\n", "2024-06-16 03:17:40,477 : INFO : EPOCH 4 - PROGRESS: at 7.88% examples, 360534 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:41,484 : INFO : EPOCH 4 - PROGRESS: at 18.81% examples, 373825 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:42,505 : INFO : EPOCH 4 - PROGRESS: at 30.42% examples, 398180 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:43,509 : INFO : EPOCH 4 - PROGRESS: at 45.52% examples, 388348 words/s, in_qsize 6, out_qsize 0\n", "2024-06-16 03:17:44,574 : INFO : EPOCH 4 - PROGRESS: at 53.82% examples, 357575 words/s, in_qsize 3, out_qsize 2\n", "2024-06-16 03:17:45,583 : INFO : EPOCH 4 - PROGRESS: at 60.26% examples, 366828 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:46,604 : INFO : EPOCH 4 - PROGRESS: at 66.70% examples, 386252 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:47,625 : INFO : EPOCH 4 - PROGRESS: at 81.78% examples, 392361 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:48,629 : INFO : EPOCH 4 - PROGRESS: at 99.09% examples, 397753 words/s, in_qsize 5, out_qsize 0\n", "2024-06-16 03:17:48,744 : INFO : EPOCH 4: training on 4013071 raw words (3707868 effective words) took 9.3s, 399986 effective words/s\n", "2024-06-16 03:17:48,745 : INFO : Word2Vec lifecycle event {'msg': 'training on 20065355 raw words (18537682 effective words) took 46.5s, 398815 effective words/s', 'datetime': '2024-06-16T03:17:48.745392', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", "2024-06-16 03:17:48,746 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec', 'datetime': '2024-06-16T03:17:48.746089', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'created'}\n", "2024-06-16 03:17:48,749 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'cbow_300_10.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2024-06-16T03:17:48.749250', 'gensim': '4.3.2', 'python': '3.8.10 (default, Nov 22 2023, 10:22:35) \\n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-182-generic-x86_64-with-glibc2.29', 'event': 'saving'}\n", "2024-06-16 03:17:48,750 : INFO : storing np array 'vectors' to cbow_300_10.model.wv.vectors.npy\n", "2024-06-16 03:17:48,792 : INFO : storing np array 'syn1neg' to cbow_300_10.model.syn1neg.npy\n", "2024-06-16 03:17:48,821 : INFO : not storing attribute cum_table\n", "2024-06-16 03:17:48,852 : INFO : saved cbow_300_10.model\n" ] } ], "source": [ "modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0) # comparable with ruwikiruscorpora_upos_cbow_300_10_2021\n", "modelLNT2.save('cbow_300_10.model')" ] }, { "cell_type": "markdown", "id": "e3a7d2c3", "metadata": {}, "source": [ "# most similar words viz" ] }, { "cell_type": "code", "execution_count": 33, "id": "d443adf9", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:25:19.723274Z", "start_time": "2024-06-16T00:25:19.698591Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "import seaborn as sns\n", "sns.set_style(\"darkgrid\")\n", "\n", "from sklearn.decomposition import PCA\n", "from sklearn.manifold import TSNE" ] }, { "cell_type": "code", "execution_count": 34, "id": "1c88ab62", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:25:24.532880Z", "start_time": "2024-06-16T00:25:24.513702Z" } }, "outputs": [], "source": [ "def tsnescatterplot(model, word, list_names):\n", " \"\"\" Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,\n", " its list of most similar words, and a list of words.\n", " \"\"\"\n", " arrays = np.empty((0, 300), dtype='f')\n", " word_labels = [word]\n", " color_list = ['red']\n", "\n", " # adds the vector of the query word\n", " arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)\n", " \n", " # gets list of most similar words\n", " close_words = model.wv.most_similar([word])\n", " \n", " # adds the vector for each of the closest words to the array\n", " for wrd_score in close_words:\n", " wrd_vector = model.wv.__getitem__([wrd_score[0]])\n", " word_labels.append(wrd_score[0])\n", " color_list.append('blue')\n", " arrays = np.append(arrays, wrd_vector, axis=0)\n", " \n", " # adds the vector for each of the words from list_names to the array\n", " for wrd in list_names:\n", " wrd_vector = model.wv.__getitem__([wrd])\n", " word_labels.append(wrd)\n", " color_list.append('green')\n", " arrays = np.append(arrays, wrd_vector, axis=0)\n", " \n", " # Reduces the dimensionality from 300 to 50 dimensions with PCA\n", " reduc = PCA(n_components=20).fit_transform(arrays)\n", " \n", " # Finds t-SNE coordinates for 2 dimensions\n", " np.set_printoptions(suppress=True)\n", " \n", " Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)\n", " \n", " # Sets everything up to plot\n", " df = pd.DataFrame({'x': [x for x in Y[:, 0]],\n", " 'y': [y for y in Y[:, 1]],\n", " 'words': word_labels,\n", " 'color': color_list})\n", " \n", " fig, _ = plt.subplots()\n", " fig.set_size_inches(9, 9)\n", " \n", " # Basic plot\n", " p1 = sns.regplot(data=df,\n", " x=\"x\",\n", " y=\"y\",\n", " fit_reg=False,\n", " marker=\"o\",\n", " scatter_kws={'s': 40,\n", " 'facecolors': df['color']\n", " }\n", " )\n", " \n", " # Adds annotations one by one with a loop\n", " for line in range(0, df.shape[0]):\n", " p1.text(df[\"x\"][line],\n", " df['y'][line],\n", " ' ' + df[\"words\"][line].title(),\n", " horizontalalignment='left',\n", " verticalalignment='bottom', size='medium',\n", " color=df['color'][line],\n", " weight='normal'\n", " ).set_size(15)\n", "\n", " \n", " plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)\n", " plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)\n", " \n", " plt.title('t-SNE visualization for {}'.format(word.title()))" ] }, { "cell_type": "code", "execution_count": 35, "id": "97bdf62c", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:25:26.611407Z", "start_time": "2024-06-16T00:25:25.364703Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Exception ignored on calling ctypes callback function: .match_module_callback at 0x7fa0f8423f70>\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 400, in match_module_callback\n", " self._make_module_from_path(filepath)\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n", " module = module_class(filepath, prefix, user_api, internal_api)\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 606, in __init__\n", " self.version = self.get_version()\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 646, in get_version\n", " config = get_config().split()\n", "AttributeError: 'NoneType' object has no attribute 'split'\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=[\"бог_S\"])])" ] }, { "cell_type": "code", "execution_count": 36, "id": "45b23917", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:25:50.147338Z", "start_time": "2024-06-16T00:25:49.258384Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Exception ignored on calling ctypes callback function: .match_module_callback at 0x7fa0fc640dc0>\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 400, in match_module_callback\n", " self._make_module_from_path(filepath)\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n", " module = module_class(filepath, prefix, user_api, internal_api)\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 606, in __init__\n", " self.version = self.get_version()\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 646, in get_version\n", " config = get_config().split()\n", "AttributeError: 'NoneType' object has no attribute 'split'\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tsnescatterplot(modelLNT2, 'жизнь_S', [i[0] for i in modelLNT2.wv.most_similar(negative=[\"жизнь_S\"])])" ] }, { "cell_type": "code", "execution_count": 40, "id": "d74cd56f", "metadata": { "ExecuteTime": { "end_time": "2024-06-16T00:27:15.367573Z", "start_time": "2024-06-16T00:27:14.269526Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Exception ignored on calling ctypes callback function: .match_module_callback at 0x7fa0fa6c4280>\n", "Traceback (most recent call last):\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 400, in match_module_callback\n", " self._make_module_from_path(filepath)\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n", " module = module_class(filepath, prefix, user_api, internal_api)\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 606, in __init__\n", " self.version = self.get_version()\n", " File \"/usr/local/lib/python3.8/dist-packages/threadpoolctl.py\", line 646, in get_version\n", " config = get_config().split()\n", "AttributeError: 'NoneType' object has no attribute 'split'\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "tsnescatterplot(modelLNT2, 'друг_S', [i[0] for i in modelLNT2.wv.most_similar(negative=[\"друг_S\"])])" ] }, { "cell_type": "code", "execution_count": null, "id": "3be42745", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Содержание", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }