Nicky Nicolson
committed on
Commit
·
5aa6463
1
Parent(s):
7f36417
Pass in working directory as arg to script
Browse files
- Dockerfile +1 -1
- tab2csv.py +12 -6
Dockerfile
CHANGED
@@ -18,7 +18,7 @@ RUN ls -lh /data
|
|
18 |
COPY ./tab2csv.py /code/tab2csv.py
|
19 |
|
20 |
|
21 |
-
RUN python tab2csv.py --createcols ${GBIF_DOWNLOAD_ID}
|
22 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
23 |
RUN ls -l /code
|
24 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
|
|
18 |
COPY ./tab2csv.py /code/tab2csv.py
|
19 |
|
20 |
|
21 |
+
RUN python tab2csv.py --createcols /data ${GBIF_DOWNLOAD_ID} gbifocc.csv
|
22 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
23 |
RUN ls -l /code
|
24 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
tab2csv.py
CHANGED
@@ -66,27 +66,33 @@ def getGbifDownloadColumnNames(download_format):
|
|
66 |
|
67 |
if __name__ == '__main__':
|
68 |
parser = argparse.ArgumentParser()
|
|
|
69 |
parser.add_argument("download_id")
|
70 |
parser.add_argument("-c","--createcols", action='store_true')
|
71 |
parser.add_argument("-l","--limit", type=int)
|
72 |
-
parser.add_argument("
|
73 |
args = parser.parse_args()
|
74 |
|
75 |
# Determine format of datafile by accessing download metadata from GBIF API
|
76 |
gbif_metadata = occ.download_meta(key = args.download_id)
|
77 |
download_format = gbif_metadata['request']['format']
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
79 |
column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
|
80 |
column_names = None
|
81 |
if download_format == 'SIMPLE_CSV':
|
82 |
-
|
83 |
column_names = column_names_simple_csv
|
84 |
elif download_format == 'DWCA':
|
85 |
-
|
86 |
column_names_dwca = getGbifDownloadColumnNames('DWCA')
|
87 |
column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
|
88 |
|
89 |
-
df = pd.read_csv(os.path.join(
|
90 |
encoding='utf8',
|
91 |
keep_default_na=False,
|
92 |
on_bad_lines='skip',
|
@@ -103,4 +109,4 @@ if __name__ == '__main__':
|
|
103 |
# Add column holding collector name and number
|
104 |
mask = (df.recordNumber.notnull())
|
105 |
df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
|
106 |
-
df.to_csv(args.
|
|
|
66 |
|
67 |
if __name__ == '__main__':
|
68 |
parser = argparse.ArgumentParser()
|
69 |
+
parser.add_argument("data_dir")
|
70 |
parser.add_argument("download_id")
|
71 |
parser.add_argument("-c","--createcols", action='store_true')
|
72 |
parser.add_argument("-l","--limit", type=int)
|
73 |
+
parser.add_argument("outputfilename")
|
74 |
args = parser.parse_args()
|
75 |
|
76 |
# Determine format of datafile by accessing download metadata from GBIF API
|
77 |
gbif_metadata = occ.download_meta(key = args.download_id)
|
78 |
download_format = gbif_metadata['request']['format']
|
79 |
+
# The GBIF download format determines:
|
80 |
+
# (1) the columns in the download, SIMPLE_CSV being a much restricted set
|
81 |
+
# of columns than DWCA
|
82 |
+
# (2) The name of the occurrence data file, SIMPLE_CSV : '[download_id].csv'
|
83 |
+
# DWCA : 'occurrence.txt'
|
84 |
+
inputfilename = None
|
85 |
column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
|
86 |
column_names = None
|
87 |
if download_format == 'SIMPLE_CSV':
|
88 |
+
inputfilename = '{}.csv'.format(args.download_id)
|
89 |
column_names = column_names_simple_csv
|
90 |
elif download_format == 'DWCA':
|
91 |
+
inputfilename = 'occurrence.txt'
|
92 |
column_names_dwca = getGbifDownloadColumnNames('DWCA')
|
93 |
column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
|
94 |
|
95 |
+
df = pd.read_csv(os.path.join(args.data_dir,inputfilename),
|
96 |
encoding='utf8',
|
97 |
keep_default_na=False,
|
98 |
on_bad_lines='skip',
|
|
|
109 |
# Add column holding collector name and number
|
110 |
mask = (df.recordNumber.notnull())
|
111 |
df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
|
112 |
+
df.to_csv(os.path.join(args.data_dir,args.outputfilename), index=False, sep=',')
|