Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
60ec487
1
Parent(s):
d6dde3c
make Entity a NamedTuple
Browse files
Signed-off-by: Niv Sardi <[email protected]>
- python/common/defaults.py +1 -0
- python/entity.py +23 -20
- python/main.py +16 -17
- python/screenshot.py +4 -3
- python/vendor.py +5 -5
- python/web.py +23 -16
python/common/defaults.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
DATA_PATH='./data'
|
python/entity.py
CHANGED
@@ -1,16 +1,23 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def __repr__(self):
|
12 |
return f"""
|
13 |
-
Entity:
|
14 |
name: {self.name}
|
15 |
bco: {self.bco}
|
16 |
url: {self.url}
|
@@ -22,26 +29,22 @@ Entity:
|
|
22 |
self = apply(cls, l)
|
23 |
return self
|
24 |
|
|
|
25 |
@classmethod
|
26 |
def from_dict(cls, d):
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
setattr(self, f, d[f])
|
31 |
return self
|
32 |
|
33 |
@classmethod
|
34 |
def row_names(cls):
|
35 |
-
return ['name', 'bco', 'url', 'logo']
|
36 |
-
|
37 |
-
@property
|
38 |
-
def DATA_PATH(self):
|
39 |
-
return self._DATA_PATH
|
40 |
|
41 |
def to_row(self):
|
42 |
-
return [self.name, self.bco, self.url, self.logo]
|
43 |
|
44 |
if __name__ == '__main__':
|
45 |
-
e = Entity.from_dict({'url': 'blah'})
|
46 |
assert(e.url == 'blah')
|
47 |
print(e)
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
+
import csv
|
3 |
+
from typing import NamedTuple
|
4 |
+
|
5 |
+
def read_entities(fn):
    """Load the entities CSV at ``fn`` into a dict keyed by the ``bco`` column.

    Args:
        fn: Path of the CSV file to read. The file must have a header row
            containing at least a ``bco`` column.

    Returns:
        A dict mapping each row's ``bco`` value to a copy of that row (column
        name -> string) extended with an ``id`` key holding the row's
        zero-based position in the file. If two rows share the same ``bco``
        the later row replaces the earlier one.

    Raises:
        KeyError: if a row has no ``bco`` column.
        OSError: if ``fn`` cannot be opened.
    """
    # Honour the caller-supplied path instead of a hard-coded one.
    with open(fn, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        # Build a merged copy per row; the positional index always wins over
        # any ``id`` column already present in the file.
        return {row['bco']: {**row, 'id': i} for i, row in enumerate(reader)}
|
10 |
+
|
11 |
+
class Entity(NamedTuple):
|
12 |
+
name: str
|
13 |
+
id: int = 0
|
14 |
+
bco: str = "debug"
|
15 |
+
url: str = None
|
16 |
+
logo: str = None
|
17 |
|
18 |
def __repr__(self):
|
19 |
return f"""
|
20 |
+
Entity {self.id}:
|
21 |
name: {self.name}
|
22 |
bco: {self.bco}
|
23 |
url: {self.url}
|
|
|
29 |
self = apply(cls, l)
|
30 |
return self
|
31 |
|
32 |
+
# this now looks horrible…
|
33 |
@classmethod
|
34 |
def from_dict(cls, d):
|
35 |
+
o = {'name': None, 'id': 0, 'bco': None, 'url': None, 'logo': None}
|
36 |
+
o.update(d)
|
37 |
+
self = cls(o['name'], o['id'], o['bco'], o['url'], o['logo'])
|
|
|
38 |
return self
|
39 |
|
40 |
@classmethod
|
41 |
def row_names(cls):
|
42 |
+
return ['id', 'name', 'bco', 'url', 'logo']
|
|
|
|
|
|
|
|
|
43 |
|
44 |
def to_row(self):
|
45 |
+
return [self.id, self.name, self.bco, self.url, self.logo]
|
46 |
|
47 |
if __name__ == '__main__':
    # Smoke test: build an Entity from a partial dict, check the url field
    # round-trips, then print the entity.
    sample = Entity.from_dict({'name': 'test', 'url': 'blah'})
    assert sample.url == 'blah'
    print(sample)
|
python/main.py
CHANGED
@@ -8,34 +8,32 @@ from progress.bar import ChargingBar
|
|
8 |
|
9 |
from entity import Entity
|
10 |
from common import selectors
|
|
|
11 |
|
12 |
-
pathlib.Path(f
|
13 |
|
14 |
DATA_FILE = './data/entidades.csv'
|
15 |
-
URL =
|
16 |
page = requests.get(URL)
|
17 |
-
soup = BeautifulSoup(page.content,
|
18 |
|
19 |
-
options = soup.find(class_=
|
20 |
-
with open(f
|
21 |
writer = csv.writer(csvfile)
|
22 |
writer.writerow(Entity.row_names())
|
23 |
|
|
|
24 |
bar = ChargingBar('Processing', max=len(options))
|
25 |
for o in options[1:]:
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
)
|
30 |
-
page = requests.post(URL, data={'bco': e.bco})
|
31 |
-
soup = BeautifulSoup(page.content, "html.parser")
|
32 |
try:
|
33 |
img = soup.select_one(selectors.logosbancos).attrs['src']
|
34 |
-
img = img.replace(
|
35 |
except AttributeError as err:
|
36 |
-
print('img',
|
37 |
img = None
|
38 |
-
e.logo = img
|
39 |
|
40 |
a = soup.select_one(selectors.entity_http)
|
41 |
try:
|
@@ -48,10 +46,11 @@ with open(f"{DATA_FILE}.tmp", 'w', newline='') as csvfile:
|
|
48 |
except TypeError:
|
49 |
print('ERROR', a)
|
50 |
|
51 |
-
e
|
52 |
writer.writerow(e.to_row())
|
|
|
53 |
bar.next()
|
54 |
bar.finish()
|
55 |
|
56 |
-
shutil.move(f
|
57 |
-
print(
|
|
|
8 |
|
9 |
from entity import Entity
|
10 |
from common import selectors
|
11 |
+
from common import defaults
|
12 |
|
13 |
+
pathlib.Path(f'{defaults.DATA_PATH}/logos').mkdir(parents=True, exist_ok=True)
|
14 |
|
15 |
DATA_FILE = './data/entidades.csv'
|
16 |
+
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
|
17 |
page = requests.get(URL)
|
18 |
+
soup = BeautifulSoup(page.content, 'html.parser')
|
19 |
|
20 |
+
options = soup.find(class_='form-control').find_all('option')
|
21 |
+
with open(f'{DATA_FILE}.tmp', 'w', newline='') as csvfile:
|
22 |
writer = csv.writer(csvfile)
|
23 |
writer.writerow(Entity.row_names())
|
24 |
|
25 |
+
i = 0
|
26 |
bar = ChargingBar('Processing', max=len(options))
|
27 |
for o in options[1:]:
|
28 |
+
(name, bco)= (o.text, o.attrs['value'])
|
29 |
+
page = requests.post(URL, data={'bco': bco})
|
30 |
+
soup = BeautifulSoup(page.content, 'html.parser')
|
|
|
|
|
|
|
31 |
try:
|
32 |
img = soup.select_one(selectors.logosbancos).attrs['src']
|
33 |
+
img = img.replace('../', 'https://www.bcra.gob.ar/')
|
34 |
except AttributeError as err:
|
35 |
+
print('img', name, err)
|
36 |
img = None
|
|
|
37 |
|
38 |
a = soup.select_one(selectors.entity_http)
|
39 |
try:
|
|
|
46 |
except TypeError:
|
47 |
print('ERROR', a)
|
48 |
|
49 |
+
e = Entity(name, id=i, bco=bco, logo=img, url=a)
|
50 |
writer.writerow(e.to_row())
|
51 |
+
i+=1
|
52 |
bar.next()
|
53 |
bar.finish()
|
54 |
|
55 |
+
shutil.move(f'{DATA_FILE}.tmp', DATA_FILE)
|
56 |
+
print('scrape finished')
|
python/screenshot.py
CHANGED
@@ -9,6 +9,7 @@ from selenium.webdriver.common.by import By
|
|
9 |
|
10 |
from common import selectors
|
11 |
from entity import Entity
|
|
|
12 |
|
13 |
options = webdriver.FirefoxOptions()
|
14 |
options.add_argument("--headless")
|
@@ -24,13 +25,13 @@ def sc_entity(e: Entity):
|
|
24 |
print(e)
|
25 |
driver.implicitly_wait(10)
|
26 |
driver.get(e.url)
|
27 |
-
driver.save_screenshot(f"{
|
28 |
-
driver.save_full_page_screenshot(f"{
|
29 |
|
30 |
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
31 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
32 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
33 |
-
with open(f"{
|
34 |
for i in logos:
|
35 |
f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
|
36 |
|
|
|
9 |
|
10 |
from common import selectors
|
11 |
from entity import Entity
|
12 |
+
from common import defaults
|
13 |
|
14 |
options = webdriver.FirefoxOptions()
|
15 |
options.add_argument("--headless")
|
|
|
25 |
print(e)
|
26 |
driver.implicitly_wait(10)
|
27 |
driver.get(e.url)
|
28 |
+
driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
|
29 |
+
driver.save_full_page_screenshot(f"{defaults.DATA_PATH}/{e.bco}.full.png")
|
30 |
|
31 |
logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
|
32 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
|
33 |
logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
|
34 |
+
with open(f"{defaults.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
|
35 |
for i in logos:
|
36 |
f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
|
37 |
|
python/vendor.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
import pathlib
|
3 |
-
|
4 |
-
import shutil
|
5 |
import csv
|
6 |
import concurrent.futures
|
7 |
import requests
|
@@ -9,6 +7,7 @@ import requests
|
|
9 |
from progress.bar import ChargingBar
|
10 |
|
11 |
from entity import Entity
|
|
|
12 |
import screenshot
|
13 |
import web
|
14 |
|
@@ -31,6 +30,7 @@ def from_csv(fn):
|
|
31 |
(cert, logos) = f.result()
|
32 |
except Exception as exc:
|
33 |
print('%r generated an exception: %s' % (url, exc))
|
|
|
34 |
else:
|
35 |
print(cert, logos)
|
36 |
bar.next()
|
@@ -40,6 +40,6 @@ def from_csv(fn):
|
|
40 |
#exit()
|
41 |
|
42 |
if __name__ == '__main__':
|
43 |
-
#pathlib.Path(
|
44 |
-
pathlib.Path(f"{
|
45 |
-
from_csv(f"{
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
import pathlib
|
|
|
|
|
3 |
import csv
|
4 |
import concurrent.futures
|
5 |
import requests
|
|
|
7 |
from progress.bar import ChargingBar
|
8 |
|
9 |
from entity import Entity
|
10 |
+
from common import defaults
|
11 |
import screenshot
|
12 |
import web
|
13 |
|
|
|
30 |
(cert, logos) = f.result()
|
31 |
except Exception as exc:
|
32 |
print('%r generated an exception: %s' % (url, exc))
|
33 |
+
raise
|
34 |
else:
|
35 |
print(cert, logos)
|
36 |
bar.next()
|
|
|
40 |
#exit()
|
41 |
|
42 |
if __name__ == '__main__':
    # Create <DATA_PATH>/logos (and any missing parents) and then process the
    # entities CSV stored under DATA_PATH.
    logos_dir = pathlib.Path(defaults.DATA_PATH) / "logos"
    logos_dir.mkdir(parents=True, exist_ok=True)
    from_csv(f"{defaults.DATA_PATH}/entidades.csv")
|
python/web.py
CHANGED
@@ -1,22 +1,25 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
import ssl
|
|
|
|
|
3 |
from bs4 import BeautifulSoup
|
4 |
|
5 |
from entity import Entity
|
6 |
-
from common import selectors
|
|
|
7 |
def get_page(e: Entity):
|
8 |
try:
|
9 |
page = requests.get(e.url)
|
10 |
except Exception:
|
11 |
-
|
12 |
-
page = requests.get(
|
13 |
return page
|
14 |
|
15 |
def get_cert(e: Entity):
|
16 |
ssl_url = e.url.split("/")[2]
|
17 |
try:
|
18 |
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
19 |
-
fn = f"{
|
20 |
with open(fn, 'w') as f:
|
21 |
f.write(cert)
|
22 |
except Exception as err:
|
@@ -24,23 +27,27 @@ def get_cert(e: Entity):
|
|
24 |
f.write(str(err))
|
25 |
return fn
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def get_logos(e: Entity, page):
|
28 |
soup = BeautifulSoup(page.content, "html.parser")
|
29 |
-
logos = soup.select(selectors.
|
|
|
|
|
30 |
|
31 |
i = 0
|
32 |
lfn = []
|
33 |
for l in logos:
|
34 |
-
src
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
except Exception:
|
39 |
-
res = requests.get(f"{e.url}/{src}")
|
40 |
-
|
41 |
-
fn = f"{e.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
|
42 |
-
with open(fn, "wb") as f:
|
43 |
-
shutil.copyfileobj(res.raw, f)
|
44 |
-
lfn.append(fn)
|
45 |
i+=1
|
46 |
return lfn
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
import ssl
|
3 |
+
import shutil
|
4 |
+
import requests
|
5 |
from bs4 import BeautifulSoup
|
6 |
|
7 |
from entity import Entity
|
8 |
+
from common import selectors, defaults
|
9 |
+
|
10 |
def get_page(e: Entity):
    """Fetch the entity's page, retrying once over HTTPS if the first try fails.

    Args:
        e: Entity whose ``url`` is requested.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        Exception: the original request error is re-raised when the URL is not
            a plain ``http://`` URL (so there is nothing to upgrade); any
            error from the HTTPS retry propagates unchanged.
    """
    try:
        return requests.get(e.url)
    except Exception:
        url = e.url
        # Only an explicit http:// scheme can be upgraded. A blanket
        # str.replace('http', 'https') would rewrite every "http" in the URL
        # and turn an "https://" URL into the invalid "httpss://".
        if not isinstance(url, str) or not url.startswith('http://'):
            raise
        return requests.get('https://' + url[len('http://'):])
|
17 |
|
18 |
def get_cert(e: Entity):
|
19 |
ssl_url = e.url.split("/")[2]
|
20 |
try:
|
21 |
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
22 |
+
fn = f"{defaults.DATA_PATH}/{e.bco}.cert"
|
23 |
with open(fn, 'w') as f:
|
24 |
f.write(cert)
|
25 |
except Exception as err:
|
|
|
27 |
f.write(str(err))
|
28 |
return fn
|
29 |
|
30 |
+
def get_img_logo(src: str, e: Entity = None, i: int = 0):
    """Download the image at ``src`` into the logos directory.

    Args:
        src: Absolute URL of the image to download.
        e: Entity the logo belongs to; its ``bco`` is the file-name prefix.
            Optional so a bare ``get_img_logo(src)`` call keeps working, in
            which case the prefix is "debug" (Entity's default ``bco``).
        i: Index of the logo on the page, used to keep file names distinct.

    Returns:
        The path of the written file:
        ``<DATA_PATH>/logos/<bco>.<i>.<ext>``.
    """
    from pathlib import PurePosixPath
    from urllib.parse import urlparse

    # Take the extension from the URL *path* only, so query strings and
    # fragments ("logo.png?v=2") do not leak into the file name. Fall back to
    # a generic extension when the path has none.
    ext = PurePosixPath(urlparse(src).path).suffix.lstrip('.') or 'img'
    bco = e.bco if e is not None else 'debug'
    fn = f"{defaults.DATA_PATH}/logos/{bco}.{i}.{ext}"

    # Using the response as a context manager releases the connection even if
    # writing the file fails.
    with requests.get(src, stream=True) as res:
        # Have urllib3 undo gzip/deflate transfer encoding; otherwise the raw
        # stream may hold compressed bytes rather than the image itself.
        res.raw.decode_content = True
        with open(fn, "wb") as f:
            shutil.copyfileobj(res.raw, f)
    return fn


def get_logos(e: Entity, page):
    """Download every logo candidate found in ``page`` for entity ``e``.

    Candidates are the elements matched by the ``img_logo``, ``id_logo`` and
    ``cls_logo`` selectors, in that order; an element matched by more than
    one selector is processed once per match. Elements without a ``src``
    attribute, or whose resolved URL is not http(s) (e.g. inline ``data:``
    images), are skipped.

    Args:
        e: Entity being processed; ``e.url`` is the base for relative ``src``
            values.
        page: Response-like object whose ``content`` holds the page HTML.

    Returns:
        List of file paths written by ``get_img_logo``.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(page.content, "html.parser")
    logos = soup.select(selectors.img_logo)
    logos.extend(soup.select(selectors.id_logo))
    logos.extend(soup.select(selectors.cls_logo))

    lfn = []
    # The index counts every candidate (not only the saved ones), matching the
    # per-element counter the loop kept before.
    for i, l in enumerate(logos):
        src = l.attrs.get('src')
        if not src:
            continue
        # urljoin resolves relative, root-relative and protocol-relative
        # sources correctly and leaves absolute URLs untouched; plain string
        # concatenation yields e.g. "http://bank.comimg/logo.png".
        url = urljoin(e.url, src)
        if not url.startswith(('http://', 'https://')):
            continue
        lfn.append(get_img_logo(url, e, i))
    return lfn
|