Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
ae7097b
1
Parent(s):
4b890a6
defaults defaults defaults (and types)
Browse files
Signed-off-by: Niv Sardi <[email protected]>
- python/common/defaults.py +15 -0
- python/common/mkdir.py +6 -0
- python/entity.py +3 -1
- python/imtool.py +2 -4
- python/main.py +6 -9
- python/screenshot.py +11 -6
- python/vendor.py +2 -5
- python/web.py +6 -3
python/common/defaults.py
CHANGED
@@ -1 +1,16 @@
|
|
1 |
DATA_PATH='./data'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
DATA_PATH='./data'
|
2 |
+
|
3 |
+
LABELS_PATH=f'{DATA_PATH}/labels'
|
4 |
+
IMAGES_PATH=f'{DATA_PATH}/images'
|
5 |
+
CERTS_PATH=f'{DATA_PATH}/certs'
|
6 |
+
|
7 |
+
SQUARES_DATA_PATH=f'{DATA_PATH}/squares'
|
8 |
+
SQUARES_LABELS_PATH=f'{SQUARES_DATA_PATH}/labels'
|
9 |
+
SQUARES_IMAGES_PATH=f'{SQUARES_DATA_PATH}/images'
|
10 |
+
|
11 |
+
DEBUG_PATH=f'{DATA_PATH}/debug'
|
12 |
+
DEBUG_SQUARES_PATH=f'{DEBUG_PATH}/squares'
|
13 |
+
|
14 |
+
LOGOS_DATA_PATH=f'{DATA_PATH}/logos'
|
15 |
+
|
16 |
+
MAIN_CSV_PATH=f'{DATA_PATH}/entities.csv'
|
python/common/mkdir.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pathlib
|
2 |
+
|
3 |
+
def make_dirs(dirs: [str]):
|
4 |
+
for p in dirs:
|
5 |
+
pathlib.Path(p).mkdir(parents=True, exist_ok=True)
|
6 |
+
|
python/entity.py
CHANGED
@@ -2,8 +2,10 @@
|
|
2 |
import csv
|
3 |
from typing import NamedTuple
|
4 |
|
|
|
|
|
5 |
def read_entities(fn):
|
6 |
-
with open(
|
7 |
reader = csv.DictReader(csvfile)
|
8 |
bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
|
9 |
return bcos
|
|
|
2 |
import csv
|
3 |
from typing import NamedTuple
|
4 |
|
5 |
+
from common import defaults
|
6 |
+
|
7 |
def read_entities(fn):
|
8 |
+
with open(defaults.MAIN_DATA_PATH, newline='') as csvfile:
|
9 |
reader = csv.DictReader(csvfile)
|
10 |
bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
|
11 |
return bcos
|
python/imtool.py
CHANGED
@@ -3,10 +3,10 @@
|
|
3 |
import os
|
4 |
import math
|
5 |
import cv2
|
6 |
-
import pathlib
|
7 |
from typing import NamedTuple
|
8 |
|
9 |
from entity import Entity
|
|
|
10 |
|
11 |
TILE_SIZE = 416
|
12 |
TILE_OVERLAP = 0.8
|
@@ -69,9 +69,7 @@ def crop(id, fn, logos):
|
|
69 |
img_out = f"./data/squares/images"
|
70 |
txt_out = f"./data/squares/labels"
|
71 |
debug_out = f"./data/debug"
|
72 |
-
|
73 |
-
pathlib.Path(img_out).mkdir(parents=True, exist_ok=True)
|
74 |
-
pathlib.Path(txt_out).mkdir(parents=True, exist_ok=True)
|
75 |
|
76 |
im = cv2.imread(fn)
|
77 |
rim = cv2.imread(fn)
|
|
|
3 |
import os
|
4 |
import math
|
5 |
import cv2
|
|
|
6 |
from typing import NamedTuple
|
7 |
|
8 |
from entity import Entity
|
9 |
+
from common import mkdir
|
10 |
|
11 |
TILE_SIZE = 416
|
12 |
TILE_OVERLAP = 0.8
|
|
|
69 |
img_out = f"./data/squares/images"
|
70 |
txt_out = f"./data/squares/labels"
|
71 |
debug_out = f"./data/debug"
|
72 |
+
mkdir.make_dirs[debug_out, img_out, txt_out]
|
|
|
|
|
73 |
|
74 |
im = cv2.imread(fn)
|
75 |
rim = cv2.imread(fn)
|
python/main.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import csv
|
2 |
-
import pathlib
|
3 |
import requests
|
4 |
import shutil
|
5 |
|
@@ -7,18 +6,16 @@ from bs4 import BeautifulSoup
|
|
7 |
from progress.bar import ChargingBar
|
8 |
|
9 |
from entity import Entity
|
10 |
-
from common import selectors
|
11 |
-
from common import defaults
|
12 |
|
13 |
-
pathlib.Path(f'{defaults.DATA_PATH}/logos').mkdir(parents=True, exist_ok=True)
|
14 |
-
|
15 |
-
DATA_FILE = './data/entidades.csv'
|
16 |
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
|
17 |
page = requests.get(URL)
|
18 |
soup = BeautifulSoup(page.content, 'html.parser')
|
19 |
|
20 |
options = soup.find(class_='form-control').find_all('option')
|
21 |
-
|
|
|
|
|
22 |
writer = csv.writer(csvfile)
|
23 |
writer.writerow(Entity.row_names())
|
24 |
|
@@ -46,11 +43,11 @@ with open(f'{DATA_FILE}.tmp', 'w', newline='') as csvfile:
|
|
46 |
except TypeError:
|
47 |
print('ERROR', a)
|
48 |
|
49 |
-
e = Entity(name, id=i, bco=bco, logo=img, url=a)
|
50 |
writer.writerow(e.to_row())
|
51 |
i+=1
|
52 |
bar.next()
|
53 |
bar.finish()
|
54 |
|
55 |
-
shutil.move(f'{
|
56 |
print('scrape finished')
|
|
|
1 |
import csv
|
|
|
2 |
import requests
|
3 |
import shutil
|
4 |
|
|
|
6 |
from progress.bar import ChargingBar
|
7 |
|
8 |
from entity import Entity
|
9 |
+
from common import selectors, defaults, mkdir
|
|
|
10 |
|
|
|
|
|
|
|
11 |
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
|
12 |
page = requests.get(URL)
|
13 |
soup = BeautifulSoup(page.content, 'html.parser')
|
14 |
|
15 |
options = soup.find(class_='form-control').find_all('option')
|
16 |
+
mkdir.make_dirs([defaults.DATA_PATH])
|
17 |
+
|
18 |
+
with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
|
19 |
writer = csv.writer(csvfile)
|
20 |
writer.writerow(Entity.row_names())
|
21 |
|
|
|
43 |
except TypeError:
|
44 |
print('ERROR', a)
|
45 |
|
46 |
+
e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
|
47 |
writer.writerow(e.to_row())
|
48 |
i+=1
|
49 |
bar.next()
|
50 |
bar.finish()
|
51 |
|
52 |
+
shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
|
53 |
print('scrape finished')
|
python/screenshot.py
CHANGED
@@ -9,7 +9,7 @@ from selenium.webdriver.common.by import By
|
|
9 |
|
10 |
from common import selectors
|
11 |
from entity import Entity
|
12 |
-
from common import defaults
|
13 |
|
14 |
options = webdriver.FirefoxOptions()
|
15 |
options.add_argument("--headless")
|
@@ -22,18 +22,23 @@ def coord_to_point(c):
|
|
22 |
|
23 |
driver = webdriver.Firefox(options=options)
|
24 |
def sc_entity(e: Entity):
|
25 |
-
print(e)
|
|
|
|
|
|
|
|
|
|
|
26 |
driver.implicitly_wait(10)
|
27 |
driver.get(e.url)
|
28 |
-
driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
|
29 |
-
driver.save_full_page_screenshot(f"{defaults.
|
30 |
|
31 |
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
32 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
33 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
34 |
-
with open(f"{defaults.
|
35 |
for i in logos:
|
36 |
-
f.write(f"{e.
|
37 |
|
38 |
if __name__ == '__main__':
|
39 |
sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
|
|
|
9 |
|
10 |
from common import selectors
|
11 |
from entity import Entity
|
12 |
+
from common import defaults,mkdir
|
13 |
|
14 |
options = webdriver.FirefoxOptions()
|
15 |
options.add_argument("--headless")
|
|
|
22 |
|
23 |
driver = webdriver.Firefox(options=options)
|
24 |
def sc_entity(e: Entity):
|
25 |
+
print(f'screenshoting: {e}')
|
26 |
+
mkdir.make_dirs([
|
27 |
+
defaults.IMAGES_PATH,
|
28 |
+
defaults.LABELS_PATH,
|
29 |
+
])
|
30 |
+
|
31 |
driver.implicitly_wait(10)
|
32 |
driver.get(e.url)
|
33 |
+
#driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
|
34 |
+
driver.save_full_page_screenshot(f"{defaults.IMAGES_PATH}/{e.bco}.full.png")
|
35 |
|
36 |
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
37 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
38 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
39 |
+
with open(f"{defaults.LABELS_PATH}/{e.bco}.full.txt", 'w') as f:
|
40 |
for i in logos:
|
41 |
+
f.write(f"{e.id} {coord_to_point(i.rect)}\n")
|
42 |
|
43 |
if __name__ == '__main__':
|
44 |
sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
|
python/vendor.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
-
import pathlib
|
3 |
import csv
|
4 |
import concurrent.futures
|
5 |
import requests
|
@@ -7,7 +6,7 @@ import requests
|
|
7 |
from progress.bar import ChargingBar
|
8 |
|
9 |
from entity import Entity
|
10 |
-
from common import defaults
|
11 |
import screenshot
|
12 |
import web
|
13 |
|
@@ -40,6 +39,4 @@ def from_csv(fn):
|
|
40 |
#exit()
|
41 |
|
42 |
if __name__ == '__main__':
|
43 |
-
|
44 |
-
pathlib.Path(f"{defaults.DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
|
45 |
-
from_csv(f"{defaults.DATA_PATH}/entidades.csv")
|
|
|
1 |
#!/usr/bin/env python3
|
|
|
2 |
import csv
|
3 |
import concurrent.futures
|
4 |
import requests
|
|
|
6 |
from progress.bar import ChargingBar
|
7 |
|
8 |
from entity import Entity
|
9 |
+
from common import defaults,mkdir
|
10 |
import screenshot
|
11 |
import web
|
12 |
|
|
|
39 |
#exit()
|
40 |
|
41 |
if __name__ == '__main__':
|
42 |
+
from_csv(defaults.MAIN_CSV_PATH)
|
|
|
|
python/web.py
CHANGED
@@ -5,7 +5,7 @@ import requests
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
|
7 |
from entity import Entity
|
8 |
-
from common import selectors, defaults
|
9 |
|
10 |
def get_page(e: Entity):
|
11 |
try:
|
@@ -17,9 +17,10 @@ def get_page(e: Entity):
|
|
17 |
|
18 |
def get_cert(e: Entity):
|
19 |
ssl_url = e.url.split("/")[2]
|
|
|
20 |
try:
|
21 |
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
22 |
-
fn = f"{defaults.
|
23 |
with open(fn, 'w') as f:
|
24 |
f.write(cert)
|
25 |
except Exception as err:
|
@@ -39,6 +40,8 @@ def get_logos(e: Entity, page):
|
|
39 |
logos.extend(soup.select(selectors.id_logo))
|
40 |
logos.extend(soup.select(selectors.cls_logo))
|
41 |
|
|
|
|
|
42 |
i = 0
|
43 |
lfn = []
|
44 |
for l in logos:
|
@@ -46,7 +49,7 @@ def get_logos(e: Entity, page):
|
|
46 |
src = l.attrs['src']
|
47 |
ext = src.split('.')[-1].split('/')[-1]
|
48 |
if not src.startswith('http'): src = e.url + src
|
49 |
-
fn = f"{defaults.
|
50 |
lfn.append(get_img_logo(src, fn))
|
51 |
i+=1
|
52 |
return lfn
|
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
|
7 |
from entity import Entity
|
8 |
+
from common import selectors, defaults, mkdir
|
9 |
|
10 |
def get_page(e: Entity):
|
11 |
try:
|
|
|
17 |
|
18 |
def get_cert(e: Entity):
|
19 |
ssl_url = e.url.split("/")[2]
|
20 |
+
mkdir.make_dirs(defaults.CERTS_PATH)
|
21 |
try:
|
22 |
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
23 |
+
fn = f"{defaults.CERTS_PATH}/{e.bco}.cert"
|
24 |
with open(fn, 'w') as f:
|
25 |
f.write(cert)
|
26 |
except Exception as err:
|
|
|
40 |
logos.extend(soup.select(selectors.id_logo))
|
41 |
logos.extend(soup.select(selectors.cls_logo))
|
42 |
|
43 |
+
mkdir.make_dirs(defaults.LOGOS_DATA_PATH)
|
44 |
+
|
45 |
i = 0
|
46 |
lfn = []
|
47 |
for l in logos:
|
|
|
49 |
src = l.attrs['src']
|
50 |
ext = src.split('.')[-1].split('/')[-1]
|
51 |
if not src.startswith('http'): src = e.url + src
|
52 |
+
fn = f"{defaults.LOGOS_DATA_PATH}/{e.bco}.{i}.{ext}"
|
53 |
lfn.append(get_img_logo(src, fn))
|
54 |
i+=1
|
55 |
return lfn
|