Nicky Nicolson
committed on
Commit
·
7f36417
1
Parent(s):
561ae23
Modifications to allow use of DWCA format download
Browse files
- Dockerfile +1 -1
- tab2csv.py +35 -2
Dockerfile
CHANGED
@@ -18,7 +18,7 @@ RUN ls -lh /data
|
|
18 |
COPY ./tab2csv.py /code/tab2csv.py
|
19 |
|
20 |
|
21 |
-
RUN python tab2csv.py --createcols
|
22 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
23 |
RUN ls -l /code
|
24 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
|
|
18 |
COPY ./tab2csv.py /code/tab2csv.py
|
19 |
|
20 |
|
21 |
+
RUN python tab2csv.py --createcols ${GBIF_DOWNLOAD_ID} /data/gbifocc.csv
|
22 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
23 |
RUN ls -l /code
|
24 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
tab2csv.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
import argparse
|
2 |
import pandas as pd
|
3 |
import requests
|
|
|
4 |
from tqdm import tqdm
|
5 |
tqdm.pandas()
|
|
|
6 |
|
7 |
def getFirstFamilyName(recordedBy):
|
8 |
firstFamilyName = None
|
@@ -46,19 +48,50 @@ def getFirstFamilyNameBulk(df,
|
|
46 |
df[firstFamilyNameColName] = df[recordedByColName].map(results)
|
47 |
return df
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
if __name__ == '__main__':
|
50 |
parser = argparse.ArgumentParser()
|
51 |
-
parser.add_argument("
|
52 |
parser.add_argument("-c","--createcols", action='store_true')
|
53 |
parser.add_argument("-l","--limit", type=int)
|
54 |
parser.add_argument("outputfile")
|
55 |
args = parser.parse_args()
|
56 |
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
encoding='utf8',
|
59 |
keep_default_na=False,
|
60 |
on_bad_lines='skip',
|
61 |
sep='\t',
|
|
|
62 |
nrows=args.limit)
|
63 |
if args.createcols:
|
64 |
# Extract unique recordedBy values
|
|
|
1 |
import argparse
|
2 |
import pandas as pd
|
3 |
import requests
|
4 |
+
from pygbif import occurrences as occ
|
5 |
from tqdm import tqdm
|
6 |
tqdm.pandas()
|
7 |
+
import os.path
|
8 |
|
9 |
def getFirstFamilyName(recordedBy):
|
10 |
firstFamilyName = None
|
|
|
48 |
df[firstFamilyNameColName] = df[recordedByColName].map(results)
|
49 |
return df
|
50 |
|
51 |
+
GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
|
52 |
+
GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'
|
53 |
+
|
54 |
+
def getGbifDownloadColumnNames(download_format):
|
55 |
+
column_names = None
|
56 |
+
if download_format == 'SIMPLE_CSV':
|
57 |
+
r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
|
58 |
+
columns_metadata = r.json()
|
59 |
+
column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
|
60 |
+
elif download_format == 'DWCA':
|
61 |
+
r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
|
62 |
+
columns_metadata = r.json()
|
63 |
+
column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
|
64 |
+
return column_names
|
65 |
+
|
66 |
+
|
67 |
if __name__ == '__main__':
|
68 |
parser = argparse.ArgumentParser()
|
69 |
+
parser.add_argument("download_id")
|
70 |
parser.add_argument("-c","--createcols", action='store_true')
|
71 |
parser.add_argument("-l","--limit", type=int)
|
72 |
parser.add_argument("outputfile")
|
73 |
args = parser.parse_args()
|
74 |
|
75 |
+
# Determine format of datafile by accessing download metadata from GBIF API
|
76 |
+
gbif_metadata = occ.download_meta(key = args.download_id)
|
77 |
+
download_format = gbif_metadata['request']['format']
|
78 |
+
inputfile = None
|
79 |
+
column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
|
80 |
+
column_names = None
|
81 |
+
if download_format == 'SIMPLE_CSV':
|
82 |
+
inputfile = '{}.csv'.format(args.download_id)
|
83 |
+
column_names = column_names_simple_csv
|
84 |
+
elif download_format == 'DWCA':
|
85 |
+
inputfile = 'occurrence.txt'
|
86 |
+
column_names_dwca = getGbifDownloadColumnNames('DWCA')
|
87 |
+
column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
|
88 |
+
|
89 |
+
df = pd.read_csv(os.path.join('data',inputfile),
|
90 |
encoding='utf8',
|
91 |
keep_default_na=False,
|
92 |
on_bad_lines='skip',
|
93 |
sep='\t',
|
94 |
+
usecols=column_names,
|
95 |
nrows=args.limit)
|
96 |
if args.createcols:
|
97 |
# Extract unique recordedBy values
|