Niv Sardi committed on
Commit
485f76b
·
1 Parent(s): 05802f8

implement basic python crawler

Browse files

Signed-off-by: Niv Sardi <[email protected]>

README.org ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #+TITLE: Spoof Detect
2
+
3
+ * yolo
4
+ https://github.com/ModelDepot/tfjs-yolo-tiny
5
+ https://github.com/Hyuto/yolov5-tfjs
6
+
7
+ ** augmentation
8
+ https://github.com/srp-31/Data-Augmentation-for-Object-Detection-YOLO-
9
+
10
+
11
+ * proveedores
12
+ http://www.bcra.gov.ar/SistemasFinancierosYdePagos/Proveedores-servicios-de-pago-ofrecen-cuentas-de-pago.asp
13
+ http://www.bcra.gov.ar/SistemasFinancierosYdePagos/Proveedores-servicios-de-billeteras-digitales-Interoperables.asp
14
+
15
+ http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp
crawler/common/selectors.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ logo = "img[src*=logo]"  # CSS selector: any <img> whose src attribute contains "logo"
4
+ logosbancos = "img[src*=logosbancos]"  # CSS selector: any <img> whose src attribute contains "logosbancos"
5
+
6
+ entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"  # new-tab links with "http" in href, inside <p class="post-pagina-interior">
7
+ entity_mailto = "p.post-pagina-interior a[target=_blank][href*=mailto]"  # same container, but links with "mailto" in href
crawler/entity.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
class Entity():
    """One crawled entity: a name, an identifier and the assets found for it.

    Attributes:
        name: display name of the entity.
        bco: identifier of the entity; also names its data directory.
        url: website of the entity, or None when unknown.
        logo: location of the entity's logo, or None when unknown.
    """

    def __init__(self, name, bco, url=None, logo=None):
        self.name = name
        self.bco = bco
        self.url = url
        self.logo = logo

    def __repr__(self):
        return f"""
        Entity:
            name: {self.name}
            bco: {self.bco}
            url: {self.url}
            logo: {self.logo}
        """

    @classmethod
    def from_list(cls, l):
        """Build an entity from positional values in ``row_names()`` order."""
        # ``apply()`` only exists in Python 2; argument unpacking is the
        # Python 3 equivalent and accepts any iterable.
        return cls(*l)

    @classmethod
    def from_dict(cls, d):
        """Build an entity from a mapping; every key becomes an attribute.

        Keys that are absent from *d* keep the constructor default (None).
        """
        self = cls(None, None)
        for field, value in d.items():
            setattr(self, field, value)
        return self

    @classmethod
    def row_names(cls):
        """Return the column names matching ``to_row()``."""
        return ['name', 'bco', 'url', 'logo']

    @property
    def DATA_PATH(self):
        """Directory (relative to the working directory) holding this entity's files."""
        return f"./data/{self.bco}"

    def to_row(self):
        """Return the entity as a list in ``row_names()`` order."""
        return [self.name, self.bco, self.url, self.logo]
42
+
43
if __name__ == '__main__':
    # Smoke test: build an entity from a partial dict and display it.
    sample = Entity.from_dict({'url': 'blah'})
    assert sample.url == 'blah'
    print(sample)
crawler/main.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from progress.bar import ChargingBar
6
+
7
+ from entity import Entity
8
+ from common import selectors
9
+
10
# Crawl the entity listing at URL and write one CSV row per entity
# (name, bco, url, logo) to ``entidades.csv``.
URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
# Seconds to wait on each HTTP call, so one stalled request cannot hang the crawl.
REQUEST_TIMEOUT = 30

page = requests.get(URL, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(page.content, "html.parser")

select = soup.find(class_="form-control")
if select is None:
    raise SystemExit(f"no element with class 'form-control' found at {URL}")
options = select.find_all('option')
# The first <option> is skipped (presumably a placeholder entry -- confirm).
entries = options[1:]

with open('entidades.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(Entity.row_names())

    # Size the bar by the entries actually processed, not by all options.
    bar = ChargingBar('Processing', max=len(entries))
    for o in entries:
        e = Entity(name=o.text, bco=o.attrs['value'])
        page = requests.post(URL, data={'bco': e.bco}, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, "html.parser")

        try:
            img = soup.select_one(selectors.logosbancos).attrs['src']
            img = img.replace("../", "https://www.bcra.gob.ar/")
        except AttributeError as err:
            # select_one() returned None: the page has no logo image.
            print('img', e.name, err)
            img = None
        e.logo = img

        link = soup.select_one(selectors.entity_http)
        if link is not None:
            url = link.attrs['href']
        else:
            # No web link: derive a site from the domain of the contact address.
            mailto = soup.select_one(selectors.entity_mailto)
            try:
                url = 'http://' + mailto.attrs['href'].split('@')[1]
            except (AttributeError, IndexError):
                # AttributeError: no mailto link either; IndexError: no '@'.
                print('ERROR', e.name, mailto)
                url = None

        e.url = url
        writer.writerow(e.to_row())
        bar.next()
    bar.finish()
crawler/screenshot.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ from selenium import webdriver
4
+ from selenium.webdriver.common.keys import Keys
5
+ from selenium.webdriver.common.by import By
6
+
7
+ from common import selectors
8
+ from entity import Entity
9
+
10
# Module-level headless Firefox shared by sc_entity().
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
# Firefox sizes its window with --width/--height; "--window-size" is a
# Chrome flag that Firefox does not act on.
options.add_argument("--width=1920")
options.add_argument("--height=8000")

driver = webdriver.Firefox(options=options)
15
def sc_entity(e: Entity):
    """Screenshot an entity's site and record where its logo images sit.

    Loads ``e.url`` in the shared module-level ``driver`` and writes, under
    ``e.DATA_PATH``:

    * ``screenshot.png``      -- the visible window
    * ``screenshot.full.png`` -- the full page
    * ``logo.pos``            -- one line per element matching
      ``selectors.logo``, holding the ``repr`` of that element's ``rect``
    """
    # Local import: this module has no file-level pathlib import.
    import pathlib

    # The directory may not exist yet (e.g. when this module is run directly).
    pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)

    print(e)
    driver.get(e.url)
    driver.save_screenshot(f"{e.DATA_PATH}/screenshot.png")
    driver.save_full_page_screenshot(f"{e.DATA_PATH}/screenshot.full.png")

    logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
    with open(f"{e.DATA_PATH}/logo.pos", 'w') as f:
        for element in logos:
            # One record per line so consecutive rects do not run together.
            f.write(repr(element.rect) + "\n")
            print(element.get_attribute('src'), element.rect)
+
27
if __name__ == '__main__':
    try:
        sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
    finally:
        # Always shut the browser down; otherwise the headless Firefox
        # process outlives the script.
        driver.quit()
crawler/vendor.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import pathlib
3
+ import ssl
4
+ import shutil
5
+ import csv
6
+ import concurrent.futures
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from progress.bar import ChargingBar
10
+
11
+ from entity import Entity
12
+ from common import selectors
13
+ import screenshot
14
+
15
+ def query_vendor_site(e: Entity):
16
+ pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
17
+
18
+ ssl_url = e.url.split("/")[2]
19
+ try:
20
+ page = requests.get(e.url)
21
+ except Exception:
22
+ page = requests.get(e.url.replace('http', 'https'))
23
+ soup = BeautifulSoup(page.content, "html.parser")
24
+
25
+ logos = soup.select(selectors.logo)
26
+ cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
27
+
28
+ fn = f"{e.DATA_PATH}/cert"
29
+ with open(fn, 'w') as f:
30
+ f.write(cert)
31
+ i = 0
32
+ lfn = []
33
+ for l in logos:
34
+ src = l.attrs['src']
35
+ ext = src.split('.')[-1].split('/')[-1]
36
+ try:
37
+ res = requests.get(src, stream=True)
38
+ except Exception:
39
+ res = requests.get(f"{e.url}/{src}")
40
+
41
+ fn = f"{e.DATA_PATH}/{i}.{ext}"
42
+ with open(fn, "wb") as f:
43
+ shutil.copyfileobj(res.raw, f)
44
+ lfn.append(fn)
45
+ i+=1
46
+ screenshot.sc_entity(e)
47
+ return (fn, lfn)
48
+
49
+ def from_csv(fn):
50
+ with open(fn, newline='') as csvfile:
51
+ reader = csv.DictReader(csvfile)
52
+ with concurrent.futures.ThreadPoolExecutor(max_workers = 5) as executor:
53
+ futures = {executor.submit(query_vendor_site, e): e for e in [Entity.from_dict(d) for d in reader]}
54
+ bar = ChargingBar('Processing', max=len(futures))
55
+ for f in concurrent.futures.as_completed(futures):
56
+ url = futures[f]
57
+ try:
58
+ (cert, logos) = f.result()
59
+ except Exception as exc:
60
+ print('%r generated an exception: %s' % (url, exc))
61
+ else:
62
+ print(cert, logos)
63
+ bar.next()
64
+ bar.finish()
65
+
66
+ #query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
67
+ #exit()
68
+
69
if __name__ == '__main__':
    # Process every entity listed in the CSV file.
    csv_path = 'entidades.csv'
    from_csv(csv_path)
detect.js CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  // Set up a mutation observer to listen for title changes
2
  // Will fire if framework AJAX stuff switches page title
3
  let createObserver = function() {
@@ -8,7 +13,7 @@ let createObserver = function() {
8
  console.log('Mutations!', mutations)
9
  observer.disconnect()
10
  observer = null
11
- cleanup()
12
  createObserver()
13
  })
14
 
@@ -20,4 +25,4 @@ let createObserver = function() {
20
  createObserver()
21
 
22
  // Kick off initial page load check
23
- cleanup()
 
1
+ let run = () => {  // Stub with an empty body; called once at load and again from the mutation callback
2
+
3
+
4
+ }
5
+
6
  // Set up a mutation observer to listen for title changes
7
  // Will fire if framework AJAX stuff switches page title
8
  let createObserver = function() {
 
13
  console.log('Mutations!', mutations)
14
  observer.disconnect()
15
  observer = null
16
+ run()
17
  createObserver()
18
  })
19
 
 
25
  createObserver()
26
 
27
  // Kick off initial page load check
28
+ run()