Christina Theodoris
committed on
Commit
·
abdf980
1
Parent(s):
50e921d
Add error for no files found and suppress loompy import warning
Browse files
- geneformer/tokenizer.py +13 -0
geneformer/tokenizer.py
CHANGED
@@ -17,10 +17,17 @@ Usage:
|
|
17 |
import pickle
|
18 |
from pathlib import Path
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
import loompy as lp
|
21 |
import numpy as np
|
22 |
from datasets import Dataset
|
23 |
|
|
|
|
|
24 |
GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
|
25 |
TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary.pkl"
|
26 |
|
@@ -111,7 +118,9 @@ class TranscriptomeTokenizer:
|
|
111 |
cell_metadata = {attr_key: [] for attr_key in self.custom_attr_name_dict.values()}
|
112 |
|
113 |
# loops through directories to tokenize .loom files
|
|
|
114 |
for loom_file_path in loom_data_directory.glob("*.loom"):
|
|
|
115 |
print(f"Tokenizing {loom_file_path}")
|
116 |
file_tokenized_cells, file_cell_metadata = self.tokenize_file(
|
117 |
loom_file_path
|
@@ -123,6 +132,10 @@ class TranscriptomeTokenizer:
|
|
123 |
else:
|
124 |
cell_metadata = None
|
125 |
|
|
|
|
|
|
|
|
|
126 |
return tokenized_cells, cell_metadata
|
127 |
|
128 |
def tokenize_file(self, loom_file_path):
|
|
|
17 |
import pickle
|
18 |
from pathlib import Path
|
19 |
|
20 |
+
import logging
|
21 |
+
|
22 |
+
import warnings
|
23 |
+
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
|
24 |
+
|
25 |
import loompy as lp
|
26 |
import numpy as np
|
27 |
from datasets import Dataset
|
28 |
|
29 |
+
logger = logging.getLogger(__name__)
|
30 |
+
|
31 |
GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
|
32 |
TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary.pkl"
|
33 |
|
|
|
118 |
cell_metadata = {attr_key: [] for attr_key in self.custom_attr_name_dict.values()}
|
119 |
|
120 |
# loops through directories to tokenize .loom files
|
121 |
+
file_found = 0
|
122 |
for loom_file_path in loom_data_directory.glob("*.loom"):
|
123 |
+
file_found = 1
|
124 |
print(f"Tokenizing {loom_file_path}")
|
125 |
file_tokenized_cells, file_cell_metadata = self.tokenize_file(
|
126 |
loom_file_path
|
|
|
132 |
else:
|
133 |
cell_metadata = None
|
134 |
|
135 |
+
if file_found == 0:
|
136 |
+
logger.error(
|
137 |
+
f"No .loom files found in directory {loom_data_directory}.")
|
138 |
+
raise
|
139 |
return tokenized_cells, cell_metadata
|
140 |
|
141 |
def tokenize_file(self, loom_file_path):
|