--- license: mit language: - ru tags: - natural-language-processing - dh - word2vec --- The model is built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represents his individual semantics ## Preparation All texts are converted from the TEI markup, splitted into sentences and lemmatized. Only modern orthography left in the data. ```python import html import os import re import shutil from bs4 import BeautifulSoup !pip install razdel # for splitting from razdel import sentenize from tqdm import tqdm !git clone https://github.com/tolstoydigital/TEI.git relevant_dirs = ['diaries', 'letters', 'notes', 'works'] path = 'TEI/reference/bibllist_works.xml' # allows to work with fiction and non fiction separately xml = open(path).read() soup = BeautifulSoup(xml, features="xml") group_texts = {} for it in soup.find_all("item"): ref = it.find("ref") for related in it.find_all("relatedItem"): for ref_ana in related.find_all("ref"): group_texts[ref_ana.text] = ref.text prefix_texts = 'extracted_texts' os.mkdir(prefix_texts) if os.path.exists(prefix_texts): shutil.rmtree(prefix_texts) os.mkdir(prefix_texts) # extract texts from XML complex_texts = {} for rel_dir in relevant_dirs: path = os.path.join('TEI/texts', rel_dir) for file in tqdm(sorted(os.listdir(path))): fiction = 0 if not file.endswith('.xml'): continue xml = open(os.path.join(path, file)).read() if 'Печатные варианты' in xml: continue nameID = file.replace('.xml', '') soup = BeautifulSoup(xml, features="xml") if soup.find("catRef", {"ana":"#fiction"}): fiction = 1 s = soup.find("body") paragraphs = [] for erase in s.find_all(["orig", "comments", "sic", "note"]): erase.decompose() for p in s.find_all(["p", "l"]): paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip())) if not fiction: with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f: for par in paragraphs: par = re.sub(' ([.,;:!?)"»])', '\\1', par) par = par.replace('\n', ' ') par = par.strip() par = re.sub('\s+', ' ', par) par = re.sub('\[.+?\]', '', par) for sent in sentenize(par): f.write(list(sent)[2].strip() + '\n') else: if nameID in group_texts: hyper_name = group_texts[nameID] if hyper_name not in complex_texts: complex_texts[hyper_name] = paragraphs else: complex_texts[hyper_name].extend(paragraphs) else: with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f: f.write('\n'.join(paragraphs)) for hyper_name in complex_texts: with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f: f.write('\n'.join(complex_texts[hyper_name])) # tagging from pymystem3 import Mystem pos = ['S', 'V', 'A', 'ADV'] def tagging(): m = Mystem() for fl in os.listdir(prefix_texts): #print(fl) if 'mystem' in fl: continue with open(os.path.join(prefix_texts, fl)) as f: text = f.read() lines = text.split('\n') ana_lines = [] for line in lines: line = ' '.join(line.split()[1:]) line = line.replace('ò', 'о') line = line.replace('è', 'е') line = line.replace('à', 'а') line = line.replace('ѝ', 'и') line = line.replace('ỳ', 'у') line = line.replace('о̀', 'о') #line = line.replace('Изд.̀', 'издательство') ana = [] info = m.analyze(line) for token in info: if "analysis" in token: try: analysis = token["analysis"][0] except: #print(token) continue # if "lex" in analysis: lex = analysis["lex"] #if 'gr' in analysis: gr = analysis['gr'] #print(gr) const = gr.split('=')[0] if ',' in const: pos = const.split(',')[0] else: pos = const ana.append('{}_{}'.format(lex, pos)) ln = ' '.join(ana) if re.search('[А-Яа-я]', ln): ana_lines.append(ln) with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw: fw.write('\n'.join(ana_lines)) def mk_input(): inp = [] for fl in os.listdir(prefix_texts): if not 'mystem' in fl: continue #print(fl) with open(os.path.join(prefix_texts, fl)) as f: text = f.read() lines = text.split('\n') for line in lines: words = [] for w in line.split(): word = w.split('_') if word[1] in pos: words.append(w) if len(words) > 1: inp.append(' '.join(words)) with open('input.txt', 'w') as fw: fw.write('\n'.join(inp)) tagging() mk_input() ``` The whole code is in the `w2v-prep.ipynb` notebook. ## Models There are 2 models in the repository. Their parameters are taen from the general language models to be comparable from rusvectores site. Here is the code for building models: ```python import sys import logging import gensim logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) pth = './input.txt' data = gensim.models.word2vec.LineSentence(pth) # train sentence by sentence modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1) # comparable with web_mystem_skipgram_500_2_2015.bin modelLNT1.save('skipgram_500_2.model') # saving modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0) # comparable with ruwikiruscorpora_upos_cbow_300_10_2021 modelLNT2.save('cbow_300_10.model') ``` ## Usage ```python # load models modelLNT1 = Word2Vec.load("skipgram_500_2.model") # most similar words viz import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns sns.set_style("darkgrid") from sklearn.decomposition import PCA from sklearn.manifold import TSNE def tsnescatterplot(model, word, list_names): # stolen code """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word, its list of most similar words, and a list of words. """ arrays = np.empty((0, 300), dtype='f') word_labels = [word] color_list = ['red'] # adds the vector of the query word arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0) # gets list of most similar words close_words = model.wv.most_similar([word]) # adds the vector for each of the closest words to the array for wrd_score in close_words: wrd_vector = model.wv.__getitem__([wrd_score[0]]) word_labels.append(wrd_score[0]) color_list.append('blue') arrays = np.append(arrays, wrd_vector, axis=0) # adds the vector for each of the words from list_names to the array for wrd in list_names: wrd_vector = model.wv.__getitem__([wrd]) word_labels.append(wrd) color_list.append('green') arrays = np.append(arrays, wrd_vector, axis=0) # Reduces the dimensionality from 300 to 50 dimensions with PCA reduc = PCA(n_components=20).fit_transform(arrays) # Finds t-SNE coordinates for 2 dimensions np.set_printoptions(suppress=True) Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc) # Sets everything up to plot df = pd.DataFrame({'x': [x for x in Y[:, 0]], 'y': [y for y in Y[:, 1]], 'words': word_labels, 'color': color_list}) fig, _ = plt.subplots() fig.set_size_inches(9, 9) # Basic plot p1 = sns.regplot(data=df, x="x", y="y", fit_reg=False, marker="o", scatter_kws={'s': 40, 'facecolors': df['color'] } ) # Adds annotations one by one with a loop for line in range(0, df.shape[0]): p1.text(df["x"][line], df['y'][line], ' ' + df["words"][line].title(), horizontalalignment='left', verticalalignment='bottom', size='medium', color=df['color'][line], weight='normal' ).set_size(15) plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50) plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50) plt.title('t-SNE visualization for {}'.format(word.title())) tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])]) ``` ![](./god.png) ## Train data Train corpus inclded in this repository as an `input.txt` file. It contains more than 7 mln words. For detailed explanation see Bonch-Osmolovskaya, A., Skorinkin, D., Pavlova, I., Kolbasov, M., & Orekhov, B. (2019). [Tolstoy semanticized: Constructing a digital edition for knowledge discovery](https://www.sciencedirect.com/science/article/abs/pii/S1570826818300635). *Journal of Web Semantics, 59*, 100483. ## Publication Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) // Terra Linguistica. 2023. Т. 14. No 4. С. 119–129. DOI: 10.18721/JHSS.14409 ``` @article{орехов2023индивидуальная, title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей}, author={Орехов, Б.В.}, journal={Terra Linguistica}, volume={14}, number={4}, pages={119--129}, doi={10.18721/JHSS.14409} url={https://human.spbstu.ru/userfiles/files/articles/2023/4/119-129.pdf} year={2023} } ```