{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from lavis.common.utils import get_abs_path, get_cache_path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the gzip-compressed, tab-separated download report; column names are\n",
"# supplied explicitly through `names`.\n",
"cc3m = pd.read_csv(\n",
"    \"downloaded_cc3m_report.tsv.gz\",\n",
"    compression=\"gzip\",\n",
"    sep=\"\\t\",\n",
"    names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"caption a very typical bus station\n",
"path /export/home/.cache/lavis/conceptual_caption/i...\n",
"dataset cc3m\n",
"mimetype image/jpeg\n",
"size 36078\n",
"status 200\n",
"url http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
"Name: 0, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cc3m.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3318333"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(cc3m)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 2759017 valid records\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# De-duplicate on `path` first so that the path checked on disk and the\n",
"# caption stored with it always come from the same row. Indexing the full\n",
"# frame with a position taken from `cc3m.path.unique()` pairs paths with the\n",
"# wrong rows as soon as the frame contains a repeated path.\n",
"unique_rows = cc3m.drop_duplicates(subset=\"path\")\n",
"\n",
"valid_records = []\n",
"\n",
"for path, caption in tqdm(\n",
"    zip(unique_rows[\"path\"], unique_rows[\"caption\"]), total=len(unique_rows)\n",
"):\n",
"    # `str()` guards against non-string entries (presumably missing values\n",
"    # for failed downloads -- TODO confirm); such entries never exist on disk.\n",
"    path = str(path)\n",
"    if os.path.exists(path):\n",
"        valid_records.append({\"image\": path, \"caption\": caption})\n",
"\n",
"# Number of records whose image file exists on disk.\n",
"cnt = len(valid_records)\n",
"\n",
"print(\"Found {} valid records\".format(cnt))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2759017"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(valid_records)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'image': '/export/home/.cache/lavis/conceptual_caption/images/1_3239086386.jpg',\n",
" 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_records[1]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/export/home/.cache/lavis/conceptual_caption/annotations/cc3m.json already exists\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details."
]
}
],
"source": [
"from omegaconf import OmegaConf\n",
"\n",
"\n",
"# Read the annotation storage entry for the train split from the LAVIS\n",
"# dataset config, then resolve it with `get_cache_path`.\n",
"config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_3m.yaml\")\n",
"\n",
"ann_path = OmegaConf.load(\n",
"    config_path\n",
").datasets.conceptual_caption_3m.build_info.annotations.train.storage[0]\n",
"\n",
"ann_path = get_cache_path(ann_path)\n",
"\n",
"if os.path.exists(ann_path):\n",
"    # Never overwrite an existing annotation file.\n",
"    print(\"{} already exists\".format(ann_path))\n",
"else:\n",
"    # The target directory may not exist yet; create it so that open() does\n",
"    # not fail with FileNotFoundError.\n",
"    ann_dir = os.path.dirname(ann_path)\n",
"    if ann_dir:\n",
"        os.makedirs(ann_dir, exist_ok=True)\n",
"\n",
"    # Stream the records to disk instead of first building the whole JSON\n",
"    # document as one in-memory string.\n",
"    with open(ann_path, \"w\") as f:\n",
"        json.dump(valid_records, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.10 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}