From b7227a265330f60cbc72ff5def3b7d10df800b7d Mon Sep 17 00:00:00 2001
From: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>
Date: Tue, 25 Mar 2025 18:50:01 -0400
Subject: [PATCH] Generate CSV to fix dataset viewer
Generates a metadata CSV that should make the dataset viewer (at https://huggingface.co/datasets/imageomics/fish-vista/tree/a9464827ff5c608f4079d24d2c24b939b036aa4e) display properly if all configs are removed from the README.
Also generates intermediate CSVs for all subsets.
---
code/meta_fix.ipynb | 527 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 527 insertions(+)
create mode 100644 code/meta_fix.ipynb
diff --git a/code/meta_fix.ipynb b/code/meta_fix.ipynb
new file mode 100644
index 0000000..2a36093
--- /dev/null
+++ b/code/meta_fix.ipynb
@@ -0,0 +1,527 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
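+ "# Fish-Vista Metadata Fix (for Dataset Viewer)\n",
+ "\n",
+ "Dataset: https://huggingface.co/datasets/imageomics/fish-vista\n",
+ "\n",
+ "This notebook merges the per-split CSVs of each subset (classification, trait identification, segmentation) into one metadata CSV per subset, adding `split` and `subset` columns, and then combines all three into a single `metadata.csv`."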
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Every input CSV is read from the same `resolve/<commit hash>` URL prefix.\n",
+ "BASE_URL = \"https://huggingface.co/datasets/imageomics/fish-vista/resolve/a9464827ff5c608f4079d24d2c24b939b036aa4e\"\n",
+ "\n",
+ "# The save cells below write into this folder. DataFrame.to_csv does not create\n",
+ "# missing directories, so create it up front to keep Restart & Run All working.\n",
+ "OUTPUT_DIR = Path(\"meta-subsets\")\n",
+ "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "# One URL per split, in the order the cells below index them.\n",
+ "classification_csvs = [f\"{BASE_URL}/classification_{name}.csv\" for name in (\"test\", \"train\", \"val\")]\n",
+ "\n",
+ "identification_csvs = [f\"{BASE_URL}/identification_{name}.csv\" for name in (\"test_insp\", \"test_lvsp\", \"train\", \"val\")]\n",
+ "\n",
+ "segmentation_csvs = [f\"{BASE_URL}/segmentation_{name}.csv\" for name in (\"test\", \"train\", \"val\")]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " filename | \n",
+ " source_filename | \n",
+ " original_format | \n",
+ " arkid | \n",
+ " family | \n",
+ " source | \n",
+ " owner | \n",
+ " standardized_species | \n",
+ " original_url | \n",
+ " license | \n",
+ " adipose_fin | \n",
+ " pelvic_fin | \n",
+ " barbel | \n",
+ " multiple_dorsal_fin | \n",
+ " file_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " JFBM-FISH-0022553.jpg | \n",
+ " JFBM-FISH-0022553.jpg | \n",
+ " jpg | \n",
+ " fm82032s | \n",
+ " Ictaluridae | \n",
+ " GLIN | \n",
+ " JFBM | \n",
+ " noturus gyrinus | \n",
+ " https://fishair.org/hdr-share/ftp/ark/89609/GL... | \n",
+ " CC0 1.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Images/chunk_4/JFBM-FISH-0022553.jpg | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " ark-_65665_m3e9156fee2bb64a56b45bacbbab425d07.jpg | \n",
+ " ark-_65665_m3e9156fee2bb64a56b45bacbbab425d07.jpg | \n",
+ " jpg | \n",
+ " 9f40283k | \n",
+ " apogonidae | \n",
+ " iDigBio | \n",
+ " usnm | \n",
+ " fowleria vaiulae | \n",
+ " https://fishair.org/hdr-share/ftp/ark/89609/iD... | \n",
+ " Usage Conditions Apply | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Images/chunk_4/ark-_65665_m3e9156fee2bb64a56b4... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " INHS_FISH_61023.jpg | \n",
+ " INHS_FISH_61023.jpg | \n",
+ " jpg | \n",
+ " 0695rf52 | \n",
+ " Centrarchidae | \n",
+ " GLIN | \n",
+ " INHS | \n",
+ " lepomis cyanellus | \n",
+ " https://fishair.org/hdr-share/ftp/ark/89609/GL... | \n",
+ " CC BY-NC | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Images/chunk_4/INHS_FISH_61023.jpg | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 80544_lat_FMNH_FZ#6.jpg | \n",
+ " 80544_lat_FMNH_FZ.jpg | \n",
+ " jpg | \n",
+ " 9004rk30 | \n",
+ " Cyprinidae | \n",
+ " GLIN | \n",
+ " FMNH | \n",
+ " cyprinella lutrensis | \n",
+ " https://fishair.org/hdr-share/ftp/ark/89609/GL... | \n",
+ " CC BY-NC | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Images/chunk_4/80544_lat_FMNH_FZ#6.jpg | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " m3b534eebc-6ae8-4a4d-afb0-baa0fe87cfaa.jpg | \n",
+ " m3b534eebc-6ae8-4a4d-afb0-baa0fe87cfaa.jpg | \n",
+ " jpg | \n",
+ " s838tx45 | \n",
+ " gobiidae | \n",
+ " iDigBio | \n",
+ " usnm | \n",
+ " gobiidae | \n",
+ " https://fishair.org/hdr-share/ftp/ark/89609/iD... | \n",
+ " Usage Conditions Apply | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Images/chunk_4/m3b534eebc-6ae8-4a4d-afb0-baa0f... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " filename \\\n",
+ "0 JFBM-FISH-0022553.jpg \n",
+ "1 ark-_65665_m3e9156fee2bb64a56b45bacbbab425d07.jpg \n",
+ "2 INHS_FISH_61023.jpg \n",
+ "3 80544_lat_FMNH_FZ#6.jpg \n",
+ "4 m3b534eebc-6ae8-4a4d-afb0-baa0fe87cfaa.jpg \n",
+ "\n",
+ " source_filename original_format \\\n",
+ "0 JFBM-FISH-0022553.jpg jpg \n",
+ "1 ark-_65665_m3e9156fee2bb64a56b45bacbbab425d07.jpg jpg \n",
+ "2 INHS_FISH_61023.jpg jpg \n",
+ "3 80544_lat_FMNH_FZ.jpg jpg \n",
+ "4 m3b534eebc-6ae8-4a4d-afb0-baa0fe87cfaa.jpg jpg \n",
+ "\n",
+ " arkid family source owner standardized_species \\\n",
+ "0 fm82032s Ictaluridae GLIN JFBM noturus gyrinus \n",
+ "1 9f40283k apogonidae iDigBio usnm fowleria vaiulae \n",
+ "2 0695rf52 Centrarchidae GLIN INHS lepomis cyanellus \n",
+ "3 9004rk30 Cyprinidae GLIN FMNH cyprinella lutrensis \n",
+ "4 s838tx45 gobiidae iDigBio usnm gobiidae \n",
+ "\n",
+ " original_url license \\\n",
+ "0 https://fishair.org/hdr-share/ftp/ark/89609/GL... CC0 1.0 \n",
+ "1 https://fishair.org/hdr-share/ftp/ark/89609/iD... Usage Conditions Apply \n",
+ "2 https://fishair.org/hdr-share/ftp/ark/89609/GL... CC BY-NC \n",
+ "3 https://fishair.org/hdr-share/ftp/ark/89609/GL... CC BY-NC \n",
+ "4 https://fishair.org/hdr-share/ftp/ark/89609/iD... Usage Conditions Apply \n",
+ "\n",
+ " adipose_fin pelvic_fin barbel multiple_dorsal_fin \\\n",
+ "0 NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN \n",
+ "\n",
+ " file_name \n",
+ "0 Images/chunk_4/JFBM-FISH-0022553.jpg \n",
+ "1 Images/chunk_4/ark-_65665_m3e9156fee2bb64a56b4... \n",
+ "2 Images/chunk_4/INHS_FISH_61023.jpg \n",
+ "3 Images/chunk_4/80544_lat_FMNH_FZ#6.jpg \n",
+ "4 Images/chunk_4/m3b534eebc-6ae8-4a4d-afb0-baa0f... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Start from the first classification CSV (classification_test.csv); the other\n",
+ "# splits are appended to this frame further down.\n",
+ "classification_df = pd.read_csv(classification_csvs[0])\n",
+ "classification_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# classification_csvs[0] is the test CSV, so label its rows with that split name.\n",
+ "classification_df[\"split\"] = \"test\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7556, 16)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Shape of the test split alone, for comparison with the shapes add_split prints below.\n",
+ "classification_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def add_split(df, csv, split):\n",
+ "    \"\"\"Append the rows of one split CSV to `df`, labelled with `split`.\n",
+ "\n",
+ "    Reads `csv` with `pd.read_csv`, sets its \"split\" column to `split`, prints the\n",
+ "    shape of the new rows and of the combined frame, and returns the concatenation\n",
+ "    with a fresh 0..n-1 index. `df` itself is not modified.\n",
+ "    \"\"\"\n",
+ "    print(f\"Adding split {split}\")\n",
+ "    split_rows = pd.read_csv(csv).assign(split=split)\n",
+ "    print(split_rows.shape)\n",
+ "    stacked = pd.concat([df, split_rows], ignore_index=True)\n",
+ "    print(stacked.shape)\n",
+ "    return stacked"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Adding split train\n",
+ "(35328, 16)\n",
+ "(42884, 16)\n",
+ "Adding split val\n",
+ "(4995, 16)\n",
+ "(47879, 16)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# NOTE: classification_df is both the input and the final result here, so re-running\n",
+ "# this cell without re-running the cells above appends the train/val rows again.\n",
+ "class_df = add_split(classification_df, classification_csvs[1], \"train\")\n",
+ "classification_df = add_split(class_df, classification_csvs[2], \"val\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Save Classification Metadata File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tag every row with its subset name so rows stay identifiable once subsets are combined.\n",
+ "classification_df[\"subset\"] = \"species_classification\"\n",
+ "\n",
+ "classification_df.to_csv(\"meta-subsets/classification-metadata.csv\", index = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Make Trait ID Metadata File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7771, 16)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Seed the trait-identification frame with the first CSV in the list and\n",
+ "# label its rows with that file's split name.\n",
+ "id_df = pd.read_csv(identification_csvs[0]).assign(split=\"test_insp\")\n",
+ "\n",
+ "id_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Adding split test_lvsp\n",
+ "(1935, 16)\n",
+ "(9706, 16)\n",
+ "Adding split train\n",
+ "(38038, 16)\n",
+ "(47744, 16)\n",
+ "Adding split val\n",
+ "(5238, 16)\n",
+ "(52982, 16)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Append the remaining identification CSVs in list order. id_df is rebound to the full\n",
+ "# result, so re-running this cell on its own would append the same rows again.\n",
+ "id_temp = add_split(id_df, identification_csvs[1], \"test_lvsp\")\n",
+ "id_temp = add_split(id_temp, identification_csvs[2], \"train\")\n",
+ "id_df = add_split(id_temp, identification_csvs[3], \"val\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tag every row with its subset name so rows stay identifiable once subsets are combined.\n",
+ "id_df[\"subset\"] = \"species_trait_identification\"\n",
+ "\n",
+ "id_df.to_csv(\"meta-subsets/identification-metadata.csv\", index = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Make Segmentation Metadata File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(600, 16)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Seed the segmentation frame with the first CSV in the list (the test split).\n",
+ "seg_df = pd.read_csv(segmentation_csvs[0]).assign(split=\"test\")\n",
+ "\n",
+ "seg_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Adding split train\n",
+ "(1707, 16)\n",
+ "(2307, 16)\n",
+ "Adding split val\n",
+ "(120, 16)\n",
+ "(2427, 16)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Append the train and val CSVs. seg_df is rebound to the full result, so re-running\n",
+ "# this cell on its own would append the same rows again.\n",
+ "seg_temp = add_split(seg_df, segmentation_csvs[1], \"train\")\n",
+ "seg_df = add_split(seg_temp, segmentation_csvs[2], \"val\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tag every row with its subset name so rows stay identifiable once subsets are combined.\n",
+ "seg_df[\"subset\"] = \"trait_segmentation\"\n",
+ "\n",
+ "seg_df.to_csv(\"meta-subsets/segmentation-metadata.csv\", index = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Combine Subsets to Single Metadata File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(100861, 17)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "(103288, 17)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Stack the three subset frames in two steps, printing the intermediate shape;\n",
+ "# the `subset` column added earlier tells the rows of each subset apart.\n",
+ "temp_combo = pd.concat([classification_df, id_df], ignore_index = True)\n",
+ "print(temp_combo.shape)\n",
+ "\n",
+ "combo_df = pd.concat([temp_combo, seg_df], ignore_index = True)\n",
+ "combo_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the single combined metadata file covering all subsets and splits.\n",
+ "combo_df.to_csv(\"meta-subsets/metadata.csv\", index = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "std-polars",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}