# %%
import pandas as pd

# %% [markdown]
# # Fish-Vista Metadata Fix (for Dataset Viewer)
#
# Dataset: https://huggingface.co/datasets/imageomics/fish-vista

# %%
# Every CSV is fetched from the same repository at the same revision, so the
# revision is written once here instead of being repeated in each URL.
# NOTE(review): the path segment after /resolve/ looks like a commit hash that
# pins the dataset version -- confirm before bumping it.
DATASET_REVISION = "a9464827ff5c608f4079d24d2c24b939b036aa4e"
DATASET_BASE_URL = (
    "https://huggingface.co/datasets/imageomics/fish-vista/resolve/"
    + DATASET_REVISION
)


def _csv_urls(*file_names):
    """Return the full URL of each named CSV under DATASET_BASE_URL, in order."""
    return [f"{DATASET_BASE_URL}/{file_name}" for file_name in file_names]


# Order matters: later cells pick splits by position
# (classification/segmentation: 0 = test, 1 = train, 2 = val;
#  identification: 0 = test_insp, 1 = test_lvsp, 2 = train, 3 = val).
classification_csvs = _csv_urls(
    "classification_test.csv",
    "classification_train.csv",
    "classification_val.csv",
)

identification_csvs = _csv_urls(
    "identification_test_insp.csv",
    "identification_test_lvsp.csv",
    "identification_train.csv",
    "identification_val.csv",
)

segmentation_csvs = _csv_urls(
    "segmentation_test.csv",
    "segmentation_train.csv",
    "segmentation_val.csv",
)
# %%
# Imported here (not in the top import cell) so this stage runs on its own;
# it is only needed to create the output directory before writing.
from pathlib import Path

OUTPUT_DIR = Path("meta-subsets")

# %%
# Start from the test split; the other splits are appended to it below.
classification_test_df = pd.read_csv(classification_csvs[0])
classification_test_df.head()

# %%
classification_test_df["split"] = "test"

# %%
classification_test_df.shape

# %%
def add_split(df: pd.DataFrame, csv, split: str) -> pd.DataFrame:
    """Read one split's CSV, label it, and append it to ``df``.

    Prints the split label, the shape of the newly read frame, and the shape
    of the combined frame so row counts can be checked by eye.

    Parameters
    ----------
    df : pd.DataFrame
        Rows gathered so far. It is not modified; a new frame is returned.
    csv : str or file-like
        Passed straight to ``pd.read_csv`` (path, URL, or buffer).
    split : str
        Value written to the ``"split"`` column of every row read from ``csv``.

    Returns
    -------
    pd.DataFrame
        ``df`` followed by the newly read rows, with a fresh 0..n-1 index.
    """
    print(f"Adding split {split}")
    split_df = pd.read_csv(csv)
    split_df["split"] = split
    print(split_df.shape)
    # ignore_index so the result does not carry duplicate row labels from the
    # individual CSVs.
    combined_df = pd.concat([df, split_df], ignore_index=True)
    print(combined_df.shape)
    return combined_df

# %%
# Always rebuilt from classification_test_df, so re-running this cell cannot
# append the train/val rows a second time.
classification_splits_df = add_split(classification_test_df, classification_csvs[1], "train")
classification_splits_df = add_split(classification_splits_df, classification_csvs[2], "val")

# %% [markdown]
# ### Save Classification Metadata File

# %%
classification_df = classification_splits_df.assign(subset="species_classification")

# to_csv does not create missing directories, so make sure the target exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
classification_df.to_csv(OUTPUT_DIR / "classification-metadata.csv", index=False)
# %% [markdown]
# ## Make Trait ID Metadata File

# %%
# Imported here (not in the top import cell) so this stage runs on its own;
# it is only needed to create the output directory before writing.
from pathlib import Path

OUTPUT_DIR = Path("meta-subsets")
# to_csv does not create missing directories, so make sure the target exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# %%
id_test_insp_df = pd.read_csv(identification_csvs[0])
id_test_insp_df["split"] = "test_insp"

id_test_insp_df.shape

# %%
# Always rebuilt from id_test_insp_df, so re-running this cell cannot append
# the same splits twice.
id_splits_df = add_split(id_test_insp_df, identification_csvs[1], "test_lvsp")
id_splits_df = add_split(id_splits_df, identification_csvs[2], "train")
id_splits_df = add_split(id_splits_df, identification_csvs[3], "val")

# %%
id_df = id_splits_df.assign(subset="species_trait_identification")

id_df.to_csv(OUTPUT_DIR / "identification-metadata.csv", index=False)

# %% [markdown]
# ## Make Segmentation Metadata File

# %%
seg_test_df = pd.read_csv(segmentation_csvs[0])
seg_test_df["split"] = "test"

seg_test_df.shape

# %%
# Always rebuilt from seg_test_df, for the same re-run safety as above.
seg_splits_df = add_split(seg_test_df, segmentation_csvs[1], "train")
seg_splits_df = add_split(seg_splits_df, segmentation_csvs[2], "val")

# %%
seg_df = seg_splits_df.assign(subset="trait_segmentation")

seg_df.to_csv(OUTPUT_DIR / "segmentation-metadata.csv", index=False)

# %% [markdown]
# ## Combine Subsets to Single Metadata File

# %%
classification_id_df = pd.concat([classification_df, id_df], ignore_index=True)
print(classification_id_df.shape)

combo_df = pd.concat([classification_id_df, seg_df], ignore_index=True)

# Guard against a subset being dropped or appended twice.
assert len(combo_df) == len(classification_df) + len(id_df) + len(seg_df)
combo_df.shape

# %%
combo_df.to_csv(OUTPUT_DIR / "metadata.csv", index=False)