Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
·
485f76b
1
Parent(s):
05802f8
implement basic python crawler
Browse files
Signed-off-by: Niv Sardi <[email protected]>
- README.org +15 -0
- crawler/common/selectors.py +7 -0
- crawler/entity.py +46 -0
- crawler/main.py +50 -0
- crawler/screenshot.py +28 -0
- crawler/vendor.py +70 -0
- detect.js +7 -2
README.org
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#+TITLE: Spoof Detect
|
2 |
+
|
3 |
+
* yolo
|
4 |
+
https://github.com/ModelDepot/tfjs-yolo-tiny
|
5 |
+
https://github.com/Hyuto/yolov5-tfjs
|
6 |
+
|
7 |
+
** augmentation
|
8 |
+
https://github.com/srp-31/Data-Augmentation-for-Object-Detection-YOLO-
|
9 |
+
|
10 |
+
|
11 |
+
* proveedores
|
12 |
+
http://www.bcra.gov.ar/SistemasFinancierosYdePagos/Proveedores-servicios-de-pago-ofrecen-cuentas-de-pago.asp
|
13 |
+
http://www.bcra.gov.ar/SistemasFinancierosYdePagos/Proveedores-servicios-de-billeteras-digitales-Interoperables.asp
|
14 |
+
|
15 |
+
http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp
|
crawler/common/selectors.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
# CSS selector: <img> elements whose src attribute contains "logo".
logo = "img[src*=logo]"
|
4 |
+
# CSS selector: <img> elements whose src attribute contains "logosbancos".
logosbancos = "img[src*=logosbancos]"
|
5 |
+
|
6 |
+
# CSS selector: target=_blank links inside p.post-pagina-interior whose href contains "http".
entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
|
7 |
+
# CSS selector: target=_blank links inside p.post-pagina-interior whose href contains "mailto".
entity_mailto = "p.post-pagina-interior a[target=_blank][href*=mailto]"
|
crawler/entity.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
|
3 |
+
class Entity():
    """A crawled entity: display name, ``bco`` code, website URL and logo URL."""

    def __init__(self, name, bco, url=None, logo=None):
        """Store the four fields; ``url`` and ``logo`` default to ``None``."""
        self.name = name
        self.bco = bco
        self.url = url
        self.logo = logo

    def __repr__(self):
        """Return a multi-line, human-readable dump of the four fields."""
        return f"""
Entity:
    name: {self.name}
    bco: {self.bco}
    url: {self.url}
    logo: {self.logo}
"""

    @classmethod
    def from_list(cls, l):
        """Build an entity from a sequence in ``row_names()`` order.

        The sequence is unpacked into the constructor, so it must hold at
        least ``name`` and ``bco`` and at most four items.
        """
        # `apply()` no longer exists in Python 3; argument unpacking is the
        # direct replacement.
        return cls(*l)

    @classmethod
    def from_dict(cls, d):
        """Build an entity from a mapping of attribute names to values.

        Attributes missing from ``d`` stay ``None``; every key in ``d`` is
        set as an attribute, including keys outside ``row_names()``.
        """
        self = cls(None, None)

        for field, value in d.items():
            setattr(self, field, value)
        return self

    @classmethod
    def row_names(cls):
        """Return the column names, in the same order as ``to_row()``."""
        return ['name', 'bco', 'url', 'logo']

    @property
    def DATA_PATH(self):
        """Relative directory where files for this entity are stored."""
        return f"./data/{self.bco}"

    def to_row(self):
        """Return the field values as a list, in ``row_names()`` order."""
        return [self.name, self.bco, self.url, self.logo]
|
42 |
+
|
43 |
+
if __name__ == '__main__':
    # Smoke test: build an entity from a dict and show it.
    sample = Entity.from_dict({'url': 'blah'})
    assert sample.url == 'blah'
    print(sample)
|
crawler/main.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
|
3 |
+
import requests
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
from progress.bar import ChargingBar
|
6 |
+
|
7 |
+
from entity import Entity
|
8 |
+
from common import selectors
|
9 |
+
|
10 |
+
# Listing page; the same URL is POSTed with a `bco` field to get one entity's
# detail page.
URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")

options = soup.find(class_="form-control").find_all('option')
with open('entidades.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(Entity.row_names())

    # The first <option> is skipped (presumably a placeholder entry - verify);
    # size the bar on what is actually processed so it can reach 100%.
    entries = options[1:]
    bar = ChargingBar('Processing', max=len(entries))
    for o in entries:
        e = Entity(
            name=o.text,
            bco=o.attrs['value'],
        )
        page = requests.post(URL, data={'bco': e.bco})
        soup = BeautifulSoup(page.content, "html.parser")

        # Logo: select_one() returns None when nothing matches, which
        # surfaces as AttributeError on `.attrs`.
        try:
            img = soup.select_one(selectors.logosbancos).attrs['src']
            img = img.replace("../", "https://www.bcra.gob.ar/")
        except (AttributeError, KeyError) as err:
            print('img', e.name, err)
            img = None
        e.logo = img

        # Website: prefer an explicit http link, otherwise derive the site
        # from the domain part of a mailto link.
        a = soup.select_one(selectors.entity_http)
        try:
            a = a.attrs['href']
        except (AttributeError, KeyError):
            a = soup.select_one(selectors.entity_mailto)
            try:
                a = 'http://' + a.attrs['href'].split('@')[1]
            except (AttributeError, KeyError, IndexError, TypeError):
                # No usable link at all: report it and store no URL instead
                # of aborting the whole crawl or writing a tag object.
                print('ERROR', a)
                a = None

        e.url = a
        writer.writerow(e.to_row())
        bar.next()
    bar.finish()
|
crawler/screenshot.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
#
|
3 |
+
from selenium import webdriver
|
4 |
+
from selenium.webdriver.common.keys import Keys
|
5 |
+
from selenium.webdriver.common.by import By
|
6 |
+
|
7 |
+
from common import selectors
|
8 |
+
from entity import Entity
|
9 |
+
|
10 |
+
# Headless Firefox with a 1920x8000 window.
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
# Firefox takes its window size through --width/--height; "--window-size"
# is a Chrome switch and is not a Firefox command-line option.
options.add_argument("--width=1920")
options.add_argument("--height=8000")

# NOTE(review): this single driver serves every sc_entity() call, and
# vendor.py calls sc_entity() from a thread pool - confirm whether access
# needs to be serialised.
driver = webdriver.Firefox(options=options)
|
15 |
+
def sc_entity(e: Entity):
    """Screenshot an entity's website and record the rects of its logos.

    Loads ``e.url`` in the module-level driver and writes into
    ``e.DATA_PATH``:

    * ``screenshot.png`` - viewport screenshot
    * ``screenshot.full.png`` - full-page screenshot
    * ``logo.pos`` - ``repr`` of the rect of every element matching
      ``selectors.logo``, one per line
    """
    import pathlib

    print(e)
    # Create the output directory here too, so the function also works when
    # called on its own (e.g. from the __main__ block of this module).
    pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)

    driver.get(e.url)
    driver.save_screenshot(f"{e.DATA_PATH}/screenshot.png")
    driver.save_full_page_screenshot(f"{e.DATA_PATH}/screenshot.full.png")

    logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
    with open(f"{e.DATA_PATH}/logo.pos", 'w') as f:
        for i in logos:
            # Newline-terminated so several rects stay separable on read.
            f.write(repr(i.rect) + "\n")
            print(i.get_attribute('src'), i.rect)
|
26 |
+
|
27 |
+
if __name__ == '__main__':
    # Manual check against a single known site, stored under the "debug" code.
    debug_entity = Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'})
    sc_entity(debug_entity)
|
crawler/vendor.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
import pathlib
|
3 |
+
import ssl
|
4 |
+
import shutil
|
5 |
+
import csv
|
6 |
+
import concurrent.futures
|
7 |
+
import requests
|
8 |
+
from bs4 import BeautifulSoup
|
9 |
+
from progress.bar import ChargingBar
|
10 |
+
|
11 |
+
from entity import Entity
|
12 |
+
from common import selectors
|
13 |
+
import screenshot
|
14 |
+
|
15 |
+
def query_vendor_site(e: Entity):
    """Collect certificate, logo images and screenshots for one entity.

    Creates ``e.DATA_PATH``, fetches the page at ``e.url``, writes the
    server certificate (as returned by ``ssl.get_server_certificate``) to
    ``<DATA_PATH>/cert``, saves every image matched by ``selectors.logo`` as
    ``<DATA_PATH>/<index>.<ext>`` and finally calls
    ``screenshot.sc_entity(e)``.

    Returns:
        ``(cert_path, logo_paths)`` - path of the certificate file and the
        list of logo files that were written.

    Raises:
        Whatever the page request, certificate retrieval or screenshot step
        raises; a failed download of an individual logo is reported and
        skipped.
    """
    from urllib.parse import urljoin

    pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)

    # "scheme://host/..." split on "/" -> index 2 is the host part.
    ssl_url = e.url.split("/")[2]
    try:
        page = requests.get(e.url)
    except Exception:
        # Retry over https, rewriting only the scheme prefix (a blanket
        # replace would turn an https URL into "httpss").
        if not e.url.startswith('http://'):
            raise
        page = requests.get('https://' + e.url[len('http://'):])
    soup = BeautifulSoup(page.content, "html.parser")

    logos = soup.select(selectors.logo)
    cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)

    # Kept in its own name: it is part of the return value and must not be
    # overwritten by the per-logo file names below.
    cert_fn = f"{e.DATA_PATH}/cert"
    with open(cert_fn, 'w') as f:
        f.write(cert)

    lfn = []
    for i, l in enumerate(logos):
        src = l.attrs['src']
        ext = src.split('.')[-1].split('/')[-1]
        fn = f"{e.DATA_PATH}/{i}.{ext}"
        try:
            # Resolve relative sources against the fetched page and always
            # stream, so that `res.raw` still holds the body.
            with requests.get(urljoin(page.url, src), stream=True) as res:
                with open(fn, "wb") as f:
                    shutil.copyfileobj(res.raw, f)
        except requests.RequestException as err:
            # Best effort: one unreachable logo must not drop the entity.
            print('logo', src, err)
            continue
        lfn.append(fn)
    screenshot.sc_entity(e)
    return (cert_fn, lfn)
|
48 |
+
|
49 |
+
def from_csv(fn):
    """Run ``query_vendor_site`` for every row of the CSV file ``fn``.

    Each row is turned into an ``Entity`` with ``Entity.from_dict`` and
    processed on a pool of five worker threads. For every finished job the
    result, or the exception it raised, is printed and the progress bar
    advances.
    """
    with open(fn, newline='') as csvfile:
        entities = [Entity.from_dict(row) for row in csv.DictReader(csvfile)]
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
            # Map each future back to the entity it is processing.
            pending = {}
            for entity in entities:
                pending[pool.submit(query_vendor_site, entity)] = entity

            bar = ChargingBar('Processing', max=len(pending))
            for done in concurrent.futures.as_completed(pending):
                entity = pending[done]
                try:
                    (cert, logos) = done.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (entity, exc))
                else:
                    print(cert, logos)
                bar.next()
            bar.finish()
|
65 |
+
|
66 |
+
#query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
|
67 |
+
#exit()
|
68 |
+
|
69 |
+
if __name__ == '__main__':
    # Process the CSV file named below (main.py writes a file of this name).
    csv_path = 'entidades.csv'
    from_csv(csv_path)
|
detect.js
CHANGED
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
// Set up a mutation observer to listen for title changes
|
2 |
// Will fire if framework AJAX stuff switches page title
|
3 |
let createObserver = function() {
|
@@ -8,7 +13,7 @@ let createObserver = function() {
|
|
8 |
console.log('Mutations!', mutations)
|
9 |
observer.disconnect()
|
10 |
observer = null
|
11 |
-
|
12 |
createObserver()
|
13 |
})
|
14 |
|
@@ -20,4 +25,4 @@ let createObserver = function() {
|
|
20 |
createObserver()
|
21 |
|
22 |
// Kick off initial page load check
|
23 |
-
|
|
|
1 |
+
let run = () => {
|
2 |
+
|
3 |
+
|
4 |
+
}
|
5 |
+
|
6 |
// Set up a mutation observer to listen for title changes
|
7 |
// Will fire if framework AJAX stuff switches page title
|
8 |
let createObserver = function() {
|
|
|
13 |
console.log('Mutations!', mutations)
|
14 |
observer.disconnect()
|
15 |
observer = null
|
16 |
+
run()
|
17 |
createObserver()
|
18 |
})
|
19 |
|
|
|
25 |
createObserver()
|
26 |
|
27 |
// Kick off initial page load check
|
28 |
+
run()
|