diff --git a/.gitignore b/.gitignore
index 23b99e089..b7e889daa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,11 @@
+.DS_Store
__pycache__/
+biblio_parsed.txt
+pubmed.txt
+pubmed_data.csv
+pubmed_data.csv
bibliovenv/
Bibenv/
-.idea/
\ No newline at end of file
+.idea/
+.venv
+data/
diff --git a/app.py b/app.py
index f0891f894..350da386c 100644
--- a/app.py
+++ b/app.py
@@ -45,13 +45,13 @@
# -----
# Author: PRAISELab Team
-
# Import necessary libraries for better performance - avoid importing everything
import tempfile
import os
import requests
import functools
from datetime import datetime
+import traceback
import pandas as pd
import io
from functions import *
@@ -64,6 +64,33 @@
from shinywidgets import render_widget
from shiny.express import ui, input, render
+## pubmed API imports
+from www.services.pubmed_api.api_service import search_pubmed, fetch_pubmed
+from www.services.pubmed_api.pagination import create_df_from_pubmed_data, get_paginated_papers_df
+from www.services.pubmed_api.xml_parser import get_data_from_query
+
+## openalex API imports
+from www.services.openalex.api_service import search_open_alex
+from www.services.openalex.parser import transform_from_open_alex
+from www.services.openalex.pagination import *
+from www.services.openalex.utils import *
+
+# JS Snippet to fix caching issue auto loading the last value of "select"
+ui.tags.script("""
+document.addEventListener('DOMContentLoaded', function() {
+ var sel = document.getElementById('select');
+ if (sel) {
+ sel.setAttribute('autocomplete', 'off');
+ // Force back to the intended default on every fresh load
+ sel.value = '0';
+ // Notify Shiny's reactive system the value changed (in case it was already '0' silently)
+ var event = new Event('change', { bubbles: true });
+ sel.dispatchEvent(event);
+ }
+});
+""")
+
+
# Setup the Directory for static assets - optimized for performance
base_dir = tempfile.gettempdir() # Use system temp dir instead of creating new temp file
express.app_opts(static_assets=base_dir, debug=False)
@@ -83,6 +110,11 @@
# Include custom CSS for the app's appearance.
ui.include_css("www/static/biblioshiny.css")
+
+start_trigger = reactive.Value(0)
+api_source = reactive.Value(None)
+df = reactive.Value(None)
+
# --- Header ---
# The header bar contains the logo, app name, and a set of dropdown menus for notifications, help, donations, and credits.
with ui.tags.div(class_="header-bar"):
@@ -575,7 +607,6 @@ def get_latest_cran_version():
ui.h3("📊 Data Management", style="color: #5567BB;")
ui.p("Easily import, load, or export your dataset.")
# ---------- INITIALIZE VARIABLES ----------
- df = reactive.Value(None)
# Optimized function to reset analysis results when dataset changes
def reset_all_analyses():
@@ -641,17 +672,20 @@ def create_loading_modal(analysis_type="analysis"):
with ui.layout_sidebar(fillable=False, fill=False):
# Sidebar for data import options
with ui.sidebar(id="sidebar_load_data", position="right" ):
- # Section for Import or Load
+ # Section for Import or Load
ui.h5("Data Import Options", style="color: #5567BB;")
ui.input_select(
- "select",
- "Choose an action:",
- {
- "": "-",
+ id= "select",
+ label="Choose an action:",
+ choices={
+ "0": "-",
"1A": "Import raw data file(s)",
"1B": "Load Bibliometrix file(s)",
- "1C": "Use a sample dataset"
+ "1C": "Use a sample dataset",
+ "1D": "API Import",
+ "1E": "Load locally saved API results"
},
+ selected="0"
)
@render.express()
@@ -711,7 +745,19 @@ def select_db():
ui.input_action_button("start_button", "Start", icon=ICONS["play"])
ui.markdown("Select a predefined sample dataset for testing purposes.")
- else:
+ elif input.select() == "1D":
+ ui.p("Redirecting to the API Import tab.", style="color: gray;")
+
+ elif input.select() == "1E":
+ ui.input_file(
+ "parquet_dataset",
+ "Choose a File",
+ accept=[".pqt"]
+ )
+ ui.p("Load a saved parquet file from data folder", style="color: gray; font-size: 10px; margin-top: -20px;")
+ ui.input_action_button("load_parquet_file", "Start", icon=ICONS["play"])
+
+ else:
ui.p("Please select a valid action to begin managing your data.", style="color: gray;")
ui.p("Follow the instructions below to manage your data efficiently:")
ui.markdown(
@@ -738,30 +784,92 @@ def select_db():
# )
# ui.input_action_button("export_button", "Export", icon=ICONS["download"], disabled=True)
- @render.express()
+ @reactive.effect
+ @reactive.event(input.select)
+ def _redirect_to_api():
+ if input.select() == "1D":
+ ui.update_navs("hidden_tabs", selected="API")
+
+ @reactive.effect
+ @reactive.event(input.load_parquet_file)
+ def load_local_parquet_file():
+     # Load the uploaded parquet file into `df` and open the data view; an empty upload is reported, not indexed.
+     logging.debug('App.py - Invoked "load_local_parquet_file()"')
+     if input.select() != "1E":
+         return
+     uploaded = input.parquet_dataset()
+     if not uploaded:
+         ui.notification_show("Please choose a parquet file first.", type="error")
+         return
+     df.set(load_df_from_parquet(filepath=uploaded[0]["datapath"]))
+     if start_trigger.get() == 0:  # mostra() and toggle_sidebar() return early while this is 0
+         start_trigger.set(1)
+     ui.update_navs(id='hidden_tabs', selected='import')
+
+
+ @reactive.effect
+ @reactive.event(input.select)
+ def _ ():
+ logging.debug(f'App.py - Select value: "{input.select()}"')
+
+ @reactive.effect
@reactive.event(input.start_button)
+ def _trigger_from_file():
+ logging.debug('App.py - Invoked "_trigger_from_file()"')
+ start_trigger.set(1)
+ #ui.update_navs(id='hidden_tabs', selected='import')
+
+ @render.express()
def mostra():
- database = get_database(input)
+ logging.debug('App.py - Invoked mostra() function')
+ trigger = start_trigger.get()
+ if trigger == 0:
+ return
+ logging.debug(f'Mostra function - trigger value: "{trigger}"')
+
+ ui.HTML(init_itables())
+
+
+ source = api_source.get()
+ if source is not None:
+ database = "PubMed" if source == "pubmed" else "OpenAlex"
+ #api_source.set(None) # reset per la prossima volta
+ else:
+ database = get_database(input)
+ # database = get_database(input)
ui.update_sidebar("sidebar_load_data", show=False)
ui.update_action_button("export_button", disabled=False)
ui.markdown(f"
Data of {database}
")
- if database == "Sample":
- data = df.set(pd.read_excel("sources/samples/sample.xlsx"))
- reset_all_analyses() # Reset analysis results when sample is loaded
-
- @render.express()
- @reactive.event(input.Dataset)
- def show_data():
- text = get_data(input, database, df, reset_all_analyses)
- text
- ui.HTML(init_itables())
-
- @render.ui
- @reactive.event(input.start_button)
- def show_table():
+ logging.debug('Invoked show_table function...')
+ table_ui = ui.h5("No data available.")
+ try:
table_ui, _, _ = get_table(database, df)
- return table_ui
+ except Exception as e:
+ logging.error(e, exc_info=True)
+ table_ui
+
+ # if database == "Sample":
+ # data = df.set(pd.read_excel("sources/samples/sample.xlsx"))
+ # reset_all_analyses() # Reset analysis results when sample is loaded
+
+ # @render.express()
+ # @reactive.event(input.Dataset)
+ # def show_data():
+ # text = get_data(input, database, df, reset_all_analyses)
+ # #text
+
+
+ # @render.ui
+ # @reactive.event(lambda: start_trigger.get())
+ # def show_table():
+ # logging.debug('Invoked show_table function...')
+ # table_ui = ui.h5("No data available.")
+ # try:
+ # table_ui, _, _ = get_table(database, df)
+ # except Exception as e:
+ # logging.error(e, exc_info=True)
+ # return table_ui
# -------- ADVICE BUTTON --------
@render.ui
@@ -853,9 +961,321 @@ def indicator_types_ui_all():
"""
),
- with ui.nav_panel("None", value="API"):
- ui.h3("🚧 Warning: API is under construction 🚧")
+ ## API SECTION ##
+ PAGINATION_LIMIT = 10000
+
+ current_query = reactive.Value(None)
+ works_per_page = reactive.Value(20)
+ search_results = reactive.Value(None)
+ full_results = reactive.Value(None)
+ max_number_pages = reactive.Value(None)
+ web_env_store = reactive.Value(None)
+ query_key_store = reactive.Value(1)
+ current_page = reactive.Value(0)
+ is_pubmed = reactive.Value(True)
+ page_limit = reactive.Value(None)
+ go_to_page_button_flag = reactive.Value(True)
+
+ with ui.nav_panel("None", value="API"):
+
+ ui.h3("PubMed or OpenAlex API", style="color: #5567BB;")
+ ui.input_select(
+ "api_select",
+ "Choose an API to import data from:",
+ {
+ "": "-",
+ "pubmed": "PubMed API",
+ "openalex": "OpenAlex API"
+ },
+ )
+
+ @render.express()
+ def render_api_selection():
+ if input.api_select() == "pubmed":
+ ui.h4("PubMed Search", style="color: #5567BB; margin-top: 20px;")
+ ui.p("Import data directly from PubMed using their API.", style="color: gray;")
+ with ui.layout_column_wrap():
+ with ui.card():
+ ui.input_text("pubmed_query", "Enter PubMed search query (e.g. 'cancer or genomics...')")
+ with ui.card():
+ ui.input_numeric(id="pubmed_works_per_page", label="Select the amounts of works per page: ", min=1, max=100, value=works_per_page.get())
+
+ @render.text
+ def pubmed_works_per_page_value():
+     value = input.pubmed_works_per_page()
+     # Check the type first: an emptied numeric box yields None, and `None < 1` raises TypeError.
+     if not isinstance(value, int) or value < 1 or value > 100:
+         return 'Invalid number. Please choose an integer between 1 and 100'
+     works_per_page.set(value)
+     return f'Selected value: {value}'
+
+ ui.input_action_button("handle_pubmed_search", "Search PubMed", class_="btn-primary")
+
+ elif input.api_select() == "openalex":
+ is_pubmed.set(False)
+ ui.h4("OpenAlex Search", style="color: #5567BB; margin-top: 20px;")
+ ui.p("Import data directly from OpenAlex using their API.", style="color: gray;")
+ with ui.layout_column_wrap():
+ with ui.card():
+ ui.input_text("openalex_query", "Enter OpenAlex search query (e.g. 'cancer or genomics...')")
+
+ with ui.card():
+ ui.input_numeric(id="openalex_works_per_page", label="Select the amounts of works per page: ", min=1, max=100, value=works_per_page.get())
+
+ @render.text
+ def openalex_works_per_page_value():
+     value = input.openalex_works_per_page()
+     # Check the type first: an emptied numeric box yields None, and `None < 1` raises TypeError.
+     if not isinstance(value, int) or value < 1 or value > 100:
+         return 'Invalid number. Please choose an integer between 1 and 100'
+     works_per_page.set(value)
+     return f'Selected value: {value}'
+
+ ui.input_action_button("handle_openalex_search", "Search OpenAlex", class_="btn-primary")
+
+ results = search_results()
+ if results is not None:
+ max_pages = max_number_pages()
+ page = current_page()
+ page_limit_value = page_limit.get()
+
+ ui.h4(f"Search Results for '{current_query.get()}':", style="color: #5567BB; margin-top: 20px;")
+ ui.h5(f"Total Number of pages: {max_pages} | Works per page: {works_per_page.get()}", style="color: gray; margin-bottom: 20px;")
+ ui.h5(f"Current page: {page}", style="color: gray; margin-bottom: 20px;")
+
+ with ui.layout_column_wrap():
+ with ui.card(): # (optional) navigating to the page
+ ui.input_numeric('page_number', f"Go to page: (Max {page_limit_value})", min=1, max=page_limit_value, value=int(page))
+
+ @render.text
+ def page_number_value():
+ value = input.page_number()
+ if not isinstance(value, int) or value < 1 or value > page_limit_value:
+ go_to_page_button_flag.set(True)
+ return f'Invalid number. Please choose an integer between 1 and {page_limit_value}'
+
+ go_to_page_button_flag.set(False)
+ return f'Selected value: {value}'
+
+ #ui.input_text("page_number", f"Go to page: (Max {page_limit.get()})", value=str(page))
+ ui.input_action_button("go_to_page", "Go", class_="btn-secondary", disabled=go_to_page_button_flag.get())
+
+ with ui.card():
+ ui.input_action_button("prev_page", "← Previous", disabled=(page <= 1))
+ ui.input_action_button("next_page", "Next →", disabled=(page >= page_limit.get()))
+
+ @render.data_frame
+ def results_df():
+ df = search_results()
+ if df is None:
+ return render.DataGrid(
+ pd.DataFrame())
+
+ results_show = df.copy()
+
+ return render.DataGrid(
+ results_show,
+ filters=True,
+ width="100%",
+ summary=False,
+ styles=[
+ {
+ "style": {
+ "vertical-align": "top",
+ "width": "150px",
+ "min-width": "150px",
+ "max-width": "150px",
+ "overflow": "hidden",
+ "text-overflow": "ellipsis",
+ "white-space": "nowrap",
+ }
+ }
+ ]
+ )
+
+ ui.h4("Start Analysis", style="color: #5567BB; margin-top: 30px;")
+ ui.input_action_button("api_start_button", "Start Analysis", icon=ICONS["play"])
+ ui.input_action_button("api_save_results_to_file", "Save current page results", icon=ICONS["save"])
+
+
+ @reactive.effect
+ @reactive.event(input.handle_pubmed_search)
+ def _handle_pubmed_search():
+ is_pubmed.set(True)
+ try:
+ query = input.pubmed_query()
+ # count = input.pubmed_count()
+
+ if not query or query.strip() == "":
+ ui.notification_show("Please enter a search query", type="error")
+ return
+
+ current_query.set(query)
+
+ ui.notification_show(f"Searching PubMed for: {query}")
+ query_result = search_pubmed(query)
+ retmax = works_per_page.get()
+ max_pages, webEnv, query_key = get_data_from_query(query_result, retmax)
+ print(f"Max pages: {max_pages}, WebEnv: {webEnv}")
+ max_number_pages.set(max_pages)
+ web_env_store.set(webEnv)
+ query_key_store.set(query_key)
+ current_page.set(1)
+
+ computed_limit = compute_page_limit()
+ page_limit.set(computed_limit)
+
+ first_page = get_paginated_papers_df(webEnv, query_key, page=0, retmax = retmax)
+ search_results.set(first_page)
+
+
+ except Exception as e:
+ logging.exception('pubmed search error')
+ ui.notification_show(f"Error: {str(e)}", type="error")
+
+ def compute_page_limit():
+     # PAGINATION_LIMIT is divided by the page size elsewhere, i.e. it counts records: convert it to pages before capping.
+     total_pages = max_number_pages.get()
+     reachable_pages = PAGINATION_LIMIT // works_per_page.get()
+     return min(total_pages, reachable_pages)
+
+
+ @reactive.effect
+ @reactive.event(input.handle_openalex_search)
+ def _handle_openalex_search():
+     # Run a new OpenAlex search and publish its first page together with the paging state.
+     is_pubmed.set(False)
+
+     try:
+         # NOTE(review): lower() also lowercases operators typed in upper case - confirm this is intended.
+         query = input.openalex_query().lower().strip()
+         if not query:
+             ui.notification_show("Please enter a search query", type="error")
+             return
+
+         ui.notification_show(f"Searching OpenAlex for: {query}")
+
+         # A single request returns both the paging metadata and the rows of page 1.
+         per_page = works_per_page.get()
+         query_result = search_open_alex(query_str=query, works_per_page=per_page)
+
+         # Parse everything before touching shared state, so a failure cannot leave the
+         # query and page counters describing results that were never displayed.
+         first_page = transform_to_df(query_result)
+         total_pages = get_max_pages(query_result)
+
+         current_query.set(query)
+
+         # Stored as returned: get_max_pages() yields a string when the metadata is unusable.
+         max_number_pages.set(total_pages)
+
+         # PAGINATION_LIMIT is divided by the page size, i.e. it is treated as a record count.
+         page_limit.set(PAGINATION_LIMIT // per_page)
+
+         current_page.set(1)
+
+         search_results.set(first_page)
+
+     except Exception as e:
+         # Record the traceback, as the PubMed handler does, instead of only showing a toast.
+         logging.exception('openalex search error')
+         ui.notification_show(f"Error: {str(e)}", type="error")
+
+ @reactive.effect
+ @reactive.event(input.prev_page)
+ def _go_prev():
+     # Show the previous page of the query behind the current results; state changes only after a successful fetch.
+     target = current_page.get() - 1
+     if target < 1:
+         return
+     try:
+         if is_pubmed.get():
+             page_df = get_paginated_papers_df(web_env_store.get(), query_key_store.get(), page=target, retmax=works_per_page.get())
+         else:
+             page_df = transform_to_df(search_open_alex(query_str=current_query.get(), works_per_page=works_per_page.get(), current_page=target))
+         current_page.set(target)
+         search_results.set(page_df)
+     except Exception as e:
+         logging.exception('previous page error')
+         ui.notification_show(f"Error: {str(e)}", type="error")
+
+ @reactive.effect
+ @reactive.event(input.next_page)
+ def _go_next():
+     # Show the next page of the query behind the current results; state changes only after a successful fetch.
+     target = current_page.get() + 1
+     total_pages = max_number_pages.get()
+     if not isinstance(total_pages, int) or target >= total_pages:  # the total is a str when metadata is missing
+         return
+     try:
+         if is_pubmed.get():
+             page_df = get_paginated_papers_df(web_env_store.get(), query_key_store.get(), page=target, retmax=works_per_page.get())
+         else:
+             page_df = transform_to_df(search_open_alex(query_str=current_query.get(), works_per_page=works_per_page.get(), current_page=target))
+         current_page.set(target)
+         search_results.set(page_df)
+     except Exception as e:
+         logging.exception('next page error')
+         ui.notification_show(f"Error: {str(e)}", type="error")
+
+ @reactive.effect
+ @reactive.event(input.go_to_page)
+ def _go_to_page():
+     # Jump to a validated page of the query behind the current results; state changes only after a successful fetch.
+     target = input.page_number()
+     limit = page_limit.get()
+     if not isinstance(target, int) or target < 1 or (isinstance(limit, int) and target > limit):
+         ui.notification_show("Please enter a valid page number", type="error")
+         return
+     try:
+         if is_pubmed.get():
+             page_df = get_paginated_papers_df(web_env_store.get(), query_key_store.get(), page=target, retmax=works_per_page.get())
+         else:
+             page_df = transform_to_df(search_open_alex(query_str=current_query.get(), works_per_page=works_per_page.get(), current_page=target))
+         current_page.set(target)
+         search_results.set(page_df)
+     except Exception as e:
+         logging.exception('go to page error')
+         ui.notification_show(f"Error: {str(e)}", type="error")
+
+
+ @reactive.effect
+ @reactive.event(input.api_start_button)
+ def _api_start():
+     # Hand the page of API results currently shown over to the analysis views (no stdout dump of the frame).
+     logging.debug('App.py - Invoked "_api_start()"')
+     results = search_results.get()
+     if results is None:
+         ui.notification_show("Run a search before starting the analysis.", type="error")
+         return
+     api_source.set(input.api_select())
+     df.set(results)
+     if start_trigger.get() == 0:  # mostra() and toggle_sidebar() return early while this is 0
+         start_trigger.set(1)
+     ui.notification_show("Analysis done. See Main Information tab from the Overview side bar.")
+     ui.update_navs(id='hidden_tabs', selected='import')
+
+ @reactive.effect
+ @reactive.event(input.api_save_results_to_file)
+ def api_save_results_to_file():
+     # Append the shown results to a per-query parquet file under "data"; the analysis dataset `df` is left untouched.
+     logging.debug('Invoked "api_save_results_to_file()"')
+     results_to_save = search_results.get()
+     logging.debug(f'Results to save:\n{results_to_save}\n\n')
+     query_text = current_query.get()
+     status = False  # stays False when there is no usable query to name the file after
+     if isinstance(query_text, str) and query_text.strip() != "":
+         # Replace characters that are not portable in file names (e.g. "/", ":", "?", quotes).
+         file_name = "".join(ch if ch.isalnum() or ch in " -_" else "_" for ch in query_text.strip()) + ' search results.pqt'
+         # r'.\data' only means "./data" on Windows; a plain relative name works on every platform.
+         status = save_api_results_to_file(folder_path='data', file_name=file_name, df_to_save=results_to_save)
+     logging.debug(f'app.py - status value: "{status}"')
+     if not status:
+         ui.notification_show("Something went wrong .", type='error')
+     else:
+         ui.notification_show(f'Local file updated: "{file_name}"')
+
with ui.nav_panel("None", value="collections"):
ui.h3("🚧 Warning: Merge Collection is under construction 🚧")
@@ -8185,8 +8605,10 @@ def update_plot_settings():
# --- Sidebar Management ---
@render.express()
-@reactive.event(input.start_button)
def toggle_sidebar():
+ trigger = start_trigger.get()
+ if trigger == 0:
+ return
with ui.tags.div(id="sidebar_2", class_="custom-sidebar"):
with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False):
# Info Section
diff --git a/functions/__init__.py b/functions/__init__.py
index 20e24de36..f0107e8c5 100644
--- a/functions/__init__.py
+++ b/functions/__init__.py
@@ -40,4 +40,6 @@
from .get_thematicevolution import *
from .get_cocitation import *
from .get_collaborationnetwork import *
-from .get_worldmapcollaboration import *
\ No newline at end of file
+from .get_worldmapcollaboration import *
+from .save_api_results_to_file import *
+from .load_df_from_parquet import *
\ No newline at end of file
diff --git a/functions/get_database.py b/functions/get_database.py
index 5c5d4edc5..7140ee96c 100644
--- a/functions/get_database.py
+++ b/functions/get_database.py
@@ -1,3 +1,4 @@
+import logging
from www.services import *
@@ -11,27 +12,41 @@ def get_database(input):
Returns:
A string representing the name of the database.
"""
- if input.select() == "1A": # Bibliographic databases
+ database = ''
+ try:
+ if input.select() == "1A": # Bibliographic databases
+
+ database = input.database()
+
+ if database == "wos":
+ database = "Web of Science"
+ elif database == "scopus":
+ database = "Scopus"
+ elif database == "dimensions":
+ database = "Dimensions"
+ elif database == "lens":
+ database = "Lens.org"
+ elif database == "pubmed":
+ database = "PubMed"
+ elif database == "cochrane":
+ database = "Cochrane Library"
- database = input.database()
+ elif input.select() == "1B": # Bibliometrix database
+ database = "Bibliometrix"
- if database == "wos":
- database = "Web of Science"
- elif database == "scopus":
- database = "Scopus"
- elif database == "dimensions":
- database = "Dimensions"
- elif database == "lens":
- database = "Lens.org"
- elif database == "pubmed":
- database = "PubMed"
- elif database == "cochrane":
- database = "Cochrane Library"
-
- elif input.select() == "1B": # Bibliometrix database
- database = "Bibliometrix"
-
- elif input.select() == "1C": # Sample database
- database = "Sample"
-
+ elif input.select() == "1C": # Sample database
+ database = "Sample"
+
+ elif input.select() == "1E":
+ database = "Local parquet file"
+
+ elif input.api_select() in ['pubmed', 'openalex']:
+ database_name = input.api_select()
+ if database_name == 'pubmed':
+ database = 'PubMed'
+ if database_name == 'openalex':
+ database = 'OpenAlex'
+ except Exception as e:
+ logging.error(f'Error: \n{e}\n\n')
+
return database
diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py
index 74b261455..f188724a9 100644
--- a/functions/get_localcitedsources.py
+++ b/functions/get_localcitedsources.py
@@ -1,5 +1,7 @@
from www.services import *
-
+import numpy as np
+import logging
+import pandas as pd
def get_local_cited_sources(df, num_of_cited_sources):
"""
@@ -99,6 +101,9 @@ def wrap_label(label, width=50):
# Set x-axis ticks to 0, 50, 100, etc.
max_x = source_counts["N. of Local Citations"].max()
+ logging.debug(f'get_localCitedSources.py - max_x value= "{max_x}", max_x type: "{type(max_x)}"')
+ if pd.isna(max_x):
+ max_x = 10
tick_step = 50
x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
if x_ticks[-1] < max_x:
diff --git a/functions/get_table.py b/functions/get_table.py
index 75b9c91d8..34293996e 100644
--- a/functions/get_table.py
+++ b/functions/get_table.py
@@ -1,5 +1,6 @@
from www.services import *
from functions.get_status import *
+import logging
# Function to create a Plotly table visualization for metadata completeness
@@ -78,8 +79,11 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
Returns:
A DataTable object if data is available, otherwise a message indicating no data.
"""
+ logging.debug('get_table - Invoked "get_table()"')
# Retrieve the data from the DataFrame
data = df.get()
+ logging.debug(f'data: \n{data}')
+
table_html = ""
fig = None
@@ -125,8 +129,17 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
}
# Count missing values (NaN), empty strings, and empty lists in each column
- missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
- data.map(lambda x: x == [])).sum()
+ def is_empty_string(x):
+ # If it's a list, numpy array, or dictionary, it's not an empty string
+ if hasattr(x, '__len__') and not isinstance(x, (str, bytes)):
+ return False
+ return x in ["", " "]
+
+ is_missing = data.isna() | data.map(is_empty_string)
+ missing_counts = is_missing.sum()
+
+ # missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
+ # data.map(lambda x: x == [])).sum()
# Calculate the percentage of missing values for each column
missing_percentage = (missing_counts / total_rows) * 100
@@ -149,6 +162,9 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
# Create and return the Plotly table
fig = create_plotly_table(sorted_columns, dpi)
+ logging.debug('get_table - FIG OBJECT')
+ logging.debug(fig)
+
# HTML table header
table_header = """
diff --git a/functions/load_df_from_parquet.py b/functions/load_df_from_parquet.py
new file mode 100644
index 000000000..d32b3f34d
--- /dev/null
+++ b/functions/load_df_from_parquet.py
@@ -0,0 +1,22 @@
+import pandas as pd
+import logging
+
+def _clean_numpy_collections(df: pd.DataFrame) -> pd.DataFrame:
+    """Return ``df`` after replacing numpy arrays stored in object columns with plain Python lists."""
+    import numpy as np
+
+    def _as_list(cell):
+        # Only ndarray cells are converted; every other value passes through untouched.
+        return list(cell) if isinstance(cell, np.ndarray) else cell
+
+    for name in df.select_dtypes(include=['object']).columns:
+        # Rewrite a column only when at least one of its cells actually holds an ndarray.
+        if df[name].apply(lambda cell: isinstance(cell, np.ndarray)).any():
+            df[name] = df[name].apply(_as_list)
+    return df
+
+def load_df_from_parquet(filepath):
+    """Read the parquet file at ``filepath`` and return it with embedded numpy arrays turned into lists."""
+    frame = _clean_numpy_collections(pd.read_parquet(filepath))
+    logging.debug(f'load_df_from_parquet - {frame}')
+    return frame
\ No newline at end of file
diff --git a/functions/save_api_results_to_file.py b/functions/save_api_results_to_file.py
new file mode 100644
index 000000000..541eeea4b
--- /dev/null
+++ b/functions/save_api_results_to_file.py
@@ -0,0 +1,71 @@
+from pathlib import Path
+import pandas as pd
+import logging
+
+def save_api_results_to_file(folder_path: str, file_name: str, df_to_save: pd.DataFrame) -> bool:
+    """
+    Merge the results of an API call into a parquet file on local storage.
+
+    The file is created on first use. Rows are de-duplicated on the "UT" column, the
+    incoming row replacing a stored row with the same "UT", and the file is rewritten
+    only when the merge adds rows.
+
+    Args:
+        folder_path (str): Directory of the file; created (with parents) when missing.
+        file_name (str): Name of the parquet file inside ``folder_path``.
+        df_to_save (pd.DataFrame): Rows to store; must contain a "UT" column.
+
+    Returns:
+        bool: True if ok, False if error
+    """
+    if not isinstance(folder_path, str):
+        logging.error(f'Arg "folder_path" is of type "{type(folder_path)}", expected a str')
+        return False
+    if not isinstance(file_name, str):
+        logging.error(f'Arg "file_name" is of type "{type(file_name)}", expected a str')
+        return False
+    if not isinstance(df_to_save, pd.DataFrame):
+        logging.error(f'Arg "df_to_save" is of type "{type(df_to_save)}", expected a pd.DataFrame')
+        return False
+    if "UT" not in df_to_save.columns:
+        # De-duplication below is keyed on "UT"; without it drop_duplicates would raise KeyError.
+        logging.error('Arg "df_to_save" has no "UT" column, cannot de-duplicate rows')
+        return False
+    # DataFrame.info() prints to stdout and returns None, so the dtypes are logged instead.
+    logging.debug(df_to_save.dtypes)
+
+    dir_path = Path(folder_path)
+    dir_path.mkdir(parents=True, exist_ok=True)
+    file_path = dir_path / file_name
+
+    if not file_path.exists():
+        logging.warning(f"File not found. Initializing a new dataset for {file_path}...")
+        df_local = df_to_save.iloc[0:0].copy()  # empty frame with the incoming columns
+    else:
+        try:
+            df_local = pd.read_parquet(file_path)
+        except Exception:
+            logging.exception(f"Error reading the file: {file_path}")
+            return False
+
+    if set(df_local.columns) != set(df_to_save.columns):
+        logging.error(
+            "Structure mismatch! The file structure does not match the DataFrame in memory."
+        )
+        return False
+
+    logging.info("Structures match. Checking for new rows...")
+    combined = pd.concat([df_local, df_to_save], ignore_index=True)
+    merged = combined.drop_duplicates(subset=["UT"], keep="last")
+    new_rows_count = len(merged) - len(df_local)
+    logging.debug(f'Rows after merge: {len(merged)}, new rows: {new_rows_count}')
+    if new_rows_count <= 0:
+        logging.info("No new rows to add. File is up to date.")
+        return True
+    try:
+        merged.to_parquet(file_path)
+    except Exception:
+        logging.exception('Something went wrong. Did not update file')
+        return False
+    logging.info(f"Successfully added {new_rows_count} new rows to the file.")
+    return True
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d94f94d9f..94705efb0 100644
Binary files a/requirements.txt and b/requirements.txt differ
diff --git a/www/services/format_functions.py b/www/services/format_functions.py
index 1a8ee7af4..11f8ac762 100644
--- a/www/services/format_functions.py
+++ b/www/services/format_functions.py
@@ -523,7 +523,11 @@ def format_di_column(entry, source, file_type): # Function for DI Column
doi = entry.get('DI', [''])[0]
elif source == 'PubMed':
if file_type == '.txt':
- doi = entry.get('LID', '')
+ doi_pattern = r'(.+?)\s*\[doi\]$'
+ doi_raw = entry.get('LID', '')
+ for doi_value in doi_raw.split(";"):
+ if re.match(doi_pattern, doi_value):
+ return doi_value.replace('[doi]', '').strip()
elif source == 'Scopus':
if file_type == '.bib':
doi = entry.get('doi', '')
@@ -993,6 +997,8 @@ def format_py_column(entry, source, file_type): # Function for PY Column
if file_type == '.txt':
publication_year = entry.get('DP', '')
publication_year = re.findall(r'\d{4}', publication_year)[0] if publication_year else ''
+ if publication_year != '':
+ publication_year = int(publication_year) if publication_year.isdigit() else 0
elif source == 'Scopus':
if file_type == '.bib':
publication_year = str(entry.get('year', ''))
@@ -1627,11 +1633,11 @@ def process_single_file(data, source, file_type, author):
if column not in entry_data: # Avoid overwriting existing keys
entry_data[column] = entry.get(column, None)
- # Remove the column based on the value of the 'author' field
- if author == "surname":
- entry_data.pop('AF', None) # Remove 'AF' if it exists
- elif author == "fullname":
- entry_data.pop('AU', None) # Remove 'AU' if it exists
+ # # Remove the column based on the value of the 'author' field
+ # if author == "surname":
+ # entry_data.pop('AF', None) # Remove 'AF' if it exists
+ # elif author == "fullname":
+ # entry_data.pop('AU', None) # Remove 'AU' if it exists
entries.append(entry_data)
diff --git a/www/services/openalex/__init__.py b/www/services/openalex/__init__.py
new file mode 100644
index 000000000..78883939f
--- /dev/null
+++ b/www/services/openalex/__init__.py
@@ -0,0 +1,6 @@
+import nltk
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+logging.getLogger('parser').setLevel(logging.WARNING)
+nltk.download('wordnet')
\ No newline at end of file
diff --git a/www/services/openalex/api_service.py b/www/services/openalex/api_service.py
new file mode 100644
index 000000000..6a2d7ea1e
--- /dev/null
+++ b/www/services/openalex/api_service.py
@@ -0,0 +1,46 @@
+import requests
+from .utils import *
+import pandas as pd
+from .parser import transform_from_open_alex
+
+OPEN_ALEX_KEY = "wi0R0MWb5Dy1mtZv0OMMn5"
+OPEN_ALEX_ENDPOINT = f"https://api.openalex.org/works?api_key={OPEN_ALEX_KEY}&search="
+# SECURITY(review): the key above is committed in plain text - rotate it and load it from configuration; keyless form: "https://api.openalex.org/works?search="
+
+def search_open_alex(query_str: str, current_page: int = 1, works_per_page: int = 20) -> dict:
+    """Query the OpenAlex works endpoint and return the decoded JSON response.
+
+    Args:
+        query_str (str): Free-text search expression; it is URL-encoded before being sent.
+        current_page (int): Page to request; must be a positive int.
+        works_per_page (int): Number of works per page; must be a positive int.
+
+    Returns:
+        dict: The payload decoded by ``response.json()``.
+
+    Raises:
+        ValueError: If an argument has the wrong type or is not positive.
+        requests.RequestException: If the request fails, times out or returns an HTTP error status.
+    """
+    from urllib.parse import quote
+    if not isinstance(query_str, str):
+        raise ValueError(f'Expecting a str for "query_str", got "{type(query_str)}"')
+    if not isinstance(current_page, int) or current_page <= 0:
+        raise ValueError(f'Expected a positive int for "current_page", got "{current_page!r}"')
+    if not isinstance(works_per_page, int) or works_per_page <= 0:
+        raise ValueError(f'Expected a positive int for "works_per_page", got "{works_per_page!r}"')
+    # Encode the query so characters such as "&", "#" or "=" cannot corrupt the other URL parameters.
+    full_request = f"{OPEN_ALEX_ENDPOINT}{quote(query_str, safe='')}&per_page={works_per_page}&page={current_page}"
+    response = requests.get(full_request, timeout=30)  # bounded wait instead of hanging on a stalled connection
+    response.raise_for_status()
+    return response.json()
+
+def test():
+    """Manual smoke test: run a sample query through search_open_alex and return the results as a DataFrame."""
+    data = search_open_alex("multiverse")
+    results = get_results_open_alex(data)
+    return pd.DataFrame(results)
+
+
+# if __name__ == "__main__":
+# test()
\ No newline at end of file
diff --git a/www/services/openalex/pagination.py b/www/services/openalex/pagination.py
new file mode 100644
index 000000000..55498d622
--- /dev/null
+++ b/www/services/openalex/pagination.py
@@ -0,0 +1,80 @@
+from .api_service import search_open_alex
+from .parser import transform_from_open_alex
+from .utils import *
+import pandas as pd
+import logging
+
+def get_max_pages(open_alex_json: dict) -> int | str:
+    """Calculate the total number of result pages available for a specific topic
+
+    Args:
+        open_alex_json (dict): Decoded OpenAlex response whose 'meta' entry holds the
+            integer fields 'count' and 'per_page'
+
+    Returns:
+        int | str: Number of pages needed to list every work. A partially filled last
+            page counts as a page, and 0 is returned when there are no works. The string
+            "Metadata unavailable" is returned when the metadata is missing or malformed.
+    """
+    unavailable = "Metadata unavailable"
+
+    logging.debug('Invoked get_max_pages\n')
+    logging.debug(f'Arg: open_alex_json. Arg type: {type(open_alex_json)}\n')
+
+    if not isinstance(open_alex_json, dict):
+        logging.error(f'Expected a dict for arg "open_alex_json", got "{type(open_alex_json)}".')
+        return unavailable
+
+    metadata = open_alex_json.get('meta')
+    logging.debug(f'Metadata:\n{metadata}')
+
+    # A non-dict 'meta' has no .get(); treat it like missing metadata instead of crashing.
+    if not metadata or not isinstance(metadata, dict):
+        logging.error('No metadata available.')
+        return unavailable
+
+    works_count = metadata.get('count')
+    works_per_page = metadata.get('per_page')
+
+    # Only None means "missing": a count of 0 is a valid answer (no results).
+    if works_count is None or works_per_page is None:
+        logging.error(f'Malformed metadata.')
+        return unavailable
+
+    if not isinstance(works_count, int) or not isinstance(works_per_page, int):
+        logging.error(f'Expected int for "works_count" and "works_per_page", got "{type(works_count)}" and "{type(works_per_page)}".')
+        return unavailable
+
+    if works_count < 0 or works_per_page <= 0:
+        logging.error(f'Malformed metadata.')
+        return unavailable
+
+    # Ceiling division: the last page may be only partially filled but must stay reachable.
+    return -(-works_count // works_per_page)
+
+
+def transform_to_df(open_alex_json: dict[dict]) -> pd.DataFrame:
+    """Convert a decoded OpenAlex response into a DataFrame via transform_from_open_alex
+
+    Args:
+        open_alex_json (dict): Decoded OpenAlex response; its 'results' entry must be a list
+
+    Returns:
+        pd.DataFrame: The 'results' list loaded into a DataFrame and passed through
+            transform_from_open_alex
+
+    Raises:
+        ValueError: If the argument is not a dict or its 'results' entry is not a list
+    """
+    if isinstance(open_alex_json, dict):
+        works = open_alex_json.get('results')
+        if isinstance(works, list):
+            return transform_from_open_alex(pd.DataFrame(works))
+        raise ValueError(f'Expected a list for field "results" in JSON. Got "{type(works)}"')
+
+    raise ValueError(f'Expected a dict for arg "open_alex_json", got "{type(open_alex_json)}".')
+
+# def get_data_from_query_open_alex(query):
+# data = search_open_alex(query)
+# total_results = get_count_open_alex(data)
+
+# max_pages = (total_results // 20)
+# return max_pages
+
+# def paginated_papers_open_alex(query, page):
+# data = search_open_alex(query, page=page)
+# results_json = get_results_open_alex(data)
+# results_df = pd.DataFrame(results_json)
+
+# return transform_from_open_alex(results_df)
+
+
+
+
+
\ No newline at end of file
diff --git a/www/services/openalex/parser.py b/www/services/openalex/parser.py
new file mode 100644
index 000000000..9ff5e413d
--- /dev/null
+++ b/www/services/openalex/parser.py
@@ -0,0 +1,493 @@
+import pandas as pd
+import json
+import numpy as np
+from iso4 import abbreviate
+import nltk
+import logging
+from itertools import chain
+
+
+def _fetch_value(data: pd.Series, keys:list[str], is_return_list=False):
+    '''Walk a key path through one row and return the value found at its end.
+
+    The row is converted to a dict and the keys are followed one nesting level at a time.
+
+    Args:
+        data (pd.Series): Row to read from
+        keys (list[str]): Keys ordered from the top level down to the wanted level
+        is_return_list (bool): Chooses the fallback for unusable paths: [] when truthy, "" otherwise
+
+    Returns:
+        The value at the end of the path, or the fallback when a level is not a dict or
+        the value is None/NaN
+
+    Raises:
+        ValueError: If data is not a Series, keys is not a list, or a key is not a str
+    '''
+    if not isinstance(data, pd.Series):
+        raise ValueError(f'Data is of type {type(data)}. Must be pandas.Series!')
+    if not isinstance(keys, list):
+        raise ValueError(f'Keys is of type {type(keys)}. Must be list of strings!')
+    for key in keys:
+        if not isinstance(key, str):
+            raise ValueError(f'Key: {key} from keys: {keys} is of type: {type(key)}. Must be string!')
+
+    fallback_label = "list" if is_return_list else "string"
+    current = data.to_dict()
+
+    for key in keys:
+        if not isinstance(current, dict):
+            # Cannot descend any further: the previous level was a leaf.
+            logging.warning(
+                f'Expected a dict, got {type(current)} for key "{key}" in keys "{keys}" for data:\n{data}'
+                f'Returning empty {fallback_label}.'
+            )
+            return [] if is_return_list else ""
+
+        current = current.get(key)
+
+        # Containers are never passed to pd.isna, which would answer element-wise.
+        is_container = isinstance(current, (list, dict))
+        if current is None or (not is_container and pd.isna(current)):
+            logging.warning(
+                f'Value is missing or NaN for key "{key}" in keys: "{keys}" in data:\n{data}'
+                f'Returning empty {fallback_label}.'
+            )
+            return [] if is_return_list else ""
+
+    return current
+
+
+def _format_author_name(display_name: str) -> str:
+    """Convert 'Firstname [Middlename] Lastname' to 'Lastname F.M.' format.
+
+    The last whitespace-separated word is used as the surname; every preceding word
+    contributes its first character followed by a dot.
+
+    Args:
+        display_name (str): Author full-name to convert
+
+    Returns:
+        str: Reformatted name, the single word unchanged for one-word names, or ""
+            for blank input
+
+    Raises:
+        ValueError: If display_name is not a str
+    """
+    if not isinstance(display_name, str):
+        raise ValueError(f'Expected string for display_name "{display_name}". Got {type(display_name)}')
+
+    words = display_name.split()
+    if not words:
+        logging.warning(f'Display_name "{display_name}" is empty. Returning "" ')
+        return ""
+
+    *given_names, surname = words
+    if not given_names:
+        return surname  # single name, return as-is
+
+    initials = "".join(name[0] + "." for name in given_names)
+    return f"{surname} {initials}"
+
+
+def _calculate_JI(name_to_abbreviate: str) -> str:
+    """Abbreviate a journal name with iso4's abbreviate().
+
+    Args:
+        name_to_abbreviate (str): Journal name to abbreviate
+
+    Returns:
+        str: Abbreviated journal name, or "" for blank input or when abbreviate() fails
+
+    Raises:
+        ValueError: If name_to_abbreviate is not a str
+    """
+    logging.debug(f'Invoked _calculate_JI\nArg type:{type(name_to_abbreviate)}\nArg:\n{name_to_abbreviate}\n\n')
+
+    if not isinstance(name_to_abbreviate, str):
+        raise ValueError(f'Expected string for name_to_abbreviate "{name_to_abbreviate}". Got {type(name_to_abbreviate)}')
+
+    if name_to_abbreviate.strip() == "":
+        logging.warning(f'name_to_abbreviate "{name_to_abbreviate}" is empty. Returning "" ')
+        return ""
+
+    try:
+        abbreviated = abbreviate(name_to_abbreviate)
+    except Exception as e:
+        # Best effort: a failing abbreviation must not abort the whole row.
+        logging.warning(f'WARNING! Got exception \n{e}\nReturning empty value')
+        return ""
+
+    return abbreviated
+
+
+def _calculate_AU_or_AF(authorship_list: list[dict], fullname=False) -> list[str]:
+    """Retrieve the names of the authors in either short or fullname format
+
+    Entries that are not dicts, lack an 'author' dict, or whose 'display_name' is not a
+    non-blank str are skipped with a warning instead of raising.
+
+    Args:
+        authorship_list (list[dict]): authorships field in the open_alex json response
+        fullname (bool): When truthy return each display_name unchanged; otherwise
+            reformat it with _format_author_name
+
+    Returns:
+        list[str]: Author names in input order; empty when the input is not a list or is empty
+    """
+    logging.debug(f'Invoked _calculate_AU_or_AF\nArg type:{type(authorship_list)}\nArg:\n{authorship_list}\n\n')
+
+    if not isinstance(authorship_list, list):
+        logging.warning(f'Expectinga list, (got {type(authorship_list)}). \nAuthorships_list: {authorship_list}\nReturning empty list')
+        return []
+
+    if not authorship_list:
+        logging.warning('Authorship_list is empty or None. Returning empty list')
+        return []
+
+    authors_list = []
+
+    for authorship in authorship_list:
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expected a dict, got {type(authorship)}')
+            continue
+
+        author = authorship.get('author')
+        if not isinstance(author, dict):
+            logging.warning(f'Expected a dict, got {type(author)}')
+            continue
+
+        author_name = author.get('display_name')
+        # The type check keeps a non-str display_name from raising on .strip().
+        if not isinstance(author_name, str) or not author_name.strip():
+            logging.warning(f'Missing display_name in authorship entry: {authorship}')
+            continue
+
+        authors_list.append(author_name if fullname else _format_author_name(author_name))
+
+    return authors_list
+
+
+def _calculate_C1(authorships_list: list[dict]) -> list[str]:
+    """
+    Collect the distinct affiliation strings of all authors.
+
+    Each string is returned once, in the order it is first seen, so the result is the
+    same on every run.
+
+    Args:
+        authorships_list (list[dict]): OpenAlex's response Json's authorship field
+
+    Returns:
+        list[str]: Distinct 'raw_affiliation_strings' values; empty when the input is not
+            a list or is empty
+    """
+    logging.debug(f'Invoked _calculate_C1\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning empty list\n')
+        return []
+
+    if not authorships_list:
+        logging.warning(f'Empty authorships list. Returning empty list\n')
+        return []
+
+    # A dict keeps insertion order, which gives de-duplication with a stable order.
+    affiliations = {}
+    for authorship in authorships_list:
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expected a dict, got "{type(authorship)}"')
+            continue
+
+        raw_affiliation_strings = authorship.get('raw_affiliation_strings')
+        if not isinstance(raw_affiliation_strings, list):
+            logging.warning(
+                f'Expected a list for raw_affiliation_strings, '
+                f'got {type(raw_affiliation_strings)}: {authorship}\n'
+            )
+            continue
+
+        for affiliation_string in raw_affiliation_strings:
+            if not isinstance(affiliation_string, str):
+                logging.warning(f'Expected a str, got "{type(affiliation_string)}". Skipping...')
+                continue
+            affiliations.setdefault(affiliation_string, None)
+
+    return list(affiliations)
+
+
+def _get_first_authorship(authorships_list: list[dict]) -> dict | None:
+    '''
+    Return the first entry whose 'author_position' equals 'first'.
+
+    Entries that are not dicts, or whose 'author_position' is not a str, are skipped
+    with a warning.
+
+    Args:
+        authorships_list (list[dict]): An authorships object from an open_alex.json search results file
+
+    Returns:
+        dict | None: The matching authorship, or None when the input is not a list, is
+            empty, or holds no such entry
+    '''
+    logging.debug(f'Invoked _get_first_authorship\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning None\n')
+        return None
+
+    if not authorships_list:
+        logging.warning('Empty authorships list. Returning None\n')
+        return None
+
+    match = None
+    for candidate in authorships_list:
+        if not isinstance(candidate, dict):
+            logging.warning(f'Expecting a dict, got "{type(candidate)}". Skipping...\n')
+            continue
+
+        position = candidate.get('author_position')
+        if not isinstance(position, str):
+            logging.warning(f'Expected a str for author_position, got "{type(position)}". Skipping... ')
+            continue
+
+        if position == 'first':
+            match = candidate
+            break
+
+    if match:
+        return match
+
+    logging.warning(f'No first author found in {authorships_list}\nReturning None')
+    return None
+
+
+def _calculate_RP(authorships_list: list[dict]) -> str:
+    '''
+    Build the reprint address from the first author's entry.
+
+    The result is "<formatted name> (CORRESPONDING AUTHOR)" followed, when available,
+    by a space and the first entry of that author's 'raw_affiliation_strings'.
+
+    Args:
+        authorships_list (list[dict]): OpenAlex's Json response's authorship field
+    Returns:
+        str: The reprint address; only the author part when no usable affiliation
+            exists; "" when no first author or no str 'raw_author_name' is found
+    '''
+    logging.debug(f'Invoked _calculate_RP\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning empty str\n')
+        return ""
+
+    if not authorships_list:
+        logging.warning(f'Empty authorships list. Returning empty str\n')
+        return ""
+
+    lead = _get_first_authorship(authorships_list)
+    if not lead:
+        logging.warning(f'No first authorship found! \n{authorships_list}\nReturning empty str')
+        return ""
+
+    raw_name = lead.get('raw_author_name')
+    if not isinstance(raw_name, str):
+        logging.warning(
+            'Expected a str for raw_author_name, '
+            f'got {type(raw_name)}: {lead}\n'
+        )
+        return ""
+
+    reprint_address = f'{_format_author_name(raw_name)} (CORRESPONDING AUTHOR)'
+
+    affiliations = lead.get('raw_affiliation_strings')
+    if isinstance(affiliations, list) and affiliations:
+        primary = affiliations[0]
+        if isinstance(primary, str):
+            return f"{reprint_address} {primary}"
+
+        logging.warning(f'Expected a str, got "{type(primary)}". Returning incomplete reprint address "{reprint_address}"')
+        return reprint_address
+
+    logging.warning(f'Expected a non-empty list, got "{type(affiliations)}". Returning incomplete reprint address "{reprint_address}"')
+    return reprint_address
+
+
+def _calculate_DE_and_ID(keyword_list: list[dict]) -> list[str]:
+    '''
+    Collect the 'display_name' of every keyword entry.
+
+    Entries that are not dicts, or whose 'display_name' is not a non-blank str, are
+    skipped with a warning.
+
+    Args:
+        keyword_list (list[dict]): OpenAlex's keyword field
+    Returns:
+        list[str]: Keyword names in input order; empty when the input is not a list or is empty
+    '''
+    logging.debug(f'Invoked _calculate_DE_and_ID\nArg type:{type(keyword_list)}\nArg:\n{keyword_list}\n')
+
+    if not isinstance(keyword_list, list):
+        logging.warning(f'Expected a list, got {type(keyword_list)}: {keyword_list}\nReturning empty list\n')
+        return []
+
+    if not keyword_list:
+        logging.warning('Empty keywords list. Returning empty list\n')
+        return []
+
+    names = []
+    for entry in keyword_list:
+        if isinstance(entry, dict):
+            name = entry.get('display_name')
+            if isinstance(name, str) and name.strip():
+                names.append(name)
+            else:
+                logging.warning(f'Expected a non-empty str, got "{type(name)}" Skipping...')
+        else:
+            logging.warning(f'Expected a dict, got "{type(entry)}" Skipping...')
+
+    return names
+
+
+def _calculate_AB(abstract_inverted_index: dict) -> str:
+    '''
+    Rebuild an abstract from OpenAlex's "abstract_inverted_index" field.
+
+    The field maps each word to the list of positions at which it occurs. Every word is
+    written to its positions and the positions are joined with single spaces; a
+    position that no word claims stays an empty string.
+
+    Args:
+        abstract_inverted_index (dict): A dictionary containing words as keys and their index position as values
+    Returns:
+        str: The reconstructed abstract, or "" when the input is not a dict, is empty,
+            or its position lists are malformed
+    '''
+
+    logging.debug(f'Invoked _calculate_AB\nArg type:{type(abstract_inverted_index)}\nArg:\n{abstract_inverted_index}\n')
+
+    if not isinstance(abstract_inverted_index, dict):
+        logging.warning(f'Expected a dict, got {type(abstract_inverted_index)}: {abstract_inverted_index}\nReturning empty str\n')
+        return ""
+
+    if not abstract_inverted_index:
+        logging.warning(f'Dictionary is empty: {abstract_inverted_index}\nReturning empty str\n')
+        return ""
+
+    try:
+        max_index = max(chain.from_iterable(abstract_inverted_index.values()))
+        abstract_template = [""] * (max_index + 1)
+
+        for word, index_list in abstract_inverted_index.items():
+            for index in index_list:
+                abstract_template[index] = word
+    except (ValueError, TypeError, IndexError) as e:
+        # ValueError: no positions at all. TypeError: a position list is None or holds
+        # non-integers. IndexError: a position falls outside the template.
+        # A malformed abstract yields "" rather than aborting the caller.
+        logging.warning(f'Unexpected error!\n\n {e} \n\nReturning empty str')
+        return ""
+
+    return " ".join(abstract_template)
+
+
+def _calculate_SR(first_authorship: dict, release_year: int, journal_name: str) -> str:
+    '''
+    Build the short reference "<author>, <year>, <journal>".
+
+    The author part is the entry's 'raw_author_name' passed through _format_author_name.
+    A year that is not an int, or a journal name that is not a non-empty str, is left
+    out of the result.
+
+    Args:
+        first_authorship (dict): The first authorship dictionary
+        release_year (int): The release year of the work
+        journal_name (str): The full name of the journal
+
+    Returns:
+        str: The available parts joined with ", ", or "" when no author name can be obtained
+    '''
+    logging.debug(f'Invoked _calculate_SR\nArg type:{type(first_authorship)}\nArg:\n{first_authorship}\n')
+    logging.debug(f'Arg type:{type(release_year)}\nArg:\n{release_year}\nArg type:{type(journal_name)}\nArg:\n{journal_name}\n')
+
+    # Type checks: only a wrong authorship type is fatal, the other parts are optional.
+    if not isinstance(first_authorship, dict):
+        logging.warning(f'Expected a dict for first_authorship, got a {type(first_authorship)} -> {first_authorship}\nReturning empty str')
+        return ""
+
+    include_year = isinstance(release_year, int)
+    if not include_year:
+        logging.warning(f'Expected an int for release_year, got a {type(release_year)} -> {release_year}\nOmitting it in the result.')
+
+    include_journal = isinstance(journal_name, str)
+    if not include_journal:
+        logging.warning(f'Expected a str for journal_name, got a {type(journal_name)} -> {journal_name}\nOmitting it in the result.')
+
+    # Emptiness checks
+    if not first_authorship:
+        logging.warning('Empty first_authorship value. Returning empty str')
+        return ""
+    if not (include_journal and journal_name):
+        logging.warning('Empty journal_name value. Omitting it in the result')
+        include_journal = False
+
+    raw_name = first_authorship.get('raw_author_name')
+    if not isinstance(raw_name, str) or not raw_name.strip():
+        logging.warning(f'First author name could not be found! {first_authorship}\n Returning empty str')
+        return ""
+
+    parts = [_format_author_name(raw_name)]
+    if include_year:
+        parts.append(str(release_year))
+    if include_journal:
+        parts.append(journal_name)
+
+    return ", ".join(parts)
+
+
+
+def transform_from_open_alex(input_df: pd.DataFrame) -> pd.DataFrame:
+    """Transformation function to convert an Open_alex JSON to the bibliometrix standard format.
+
+    Each input row becomes one output row keyed by the bibliometrix field tags (DB, UT,
+    DI, ...). A row whose conversion raises is logged and left out of the result.
+
+    Args:
+        input_df (pd.DataFrame): Open_alex JSON results converted to Pandas DataFrame
+
+    Returns:
+        pd.DataFrame: Bibliometrix standard format DataFrame
+
+    Raises:
+        ValueError: If input_df is not a DataFrame
+    """
+
+    logging.debug(f'Invoked "transform_from_open_alex".\n{input_df}\n\n')
+
+    if not isinstance(input_df, pd.DataFrame):
+        raise ValueError(f'Expected a pd.DataFrame, got "{type(input_df)}"!')
+
+    result = []
+
+    for index, row in input_df.iterrows():
+        try:
+            # Values used by several fields are looked up once: every _fetch_value call
+            # converts the whole row to a dict and may log a warning.
+            authorships = _fetch_value(row, ['authorships'])
+            keywords = _calculate_DE_and_ID(_fetch_value(row, ['keywords'], is_return_list=True))
+            journal_name = _fetch_value(row, ['primary_location', 'source', 'display_name'])
+            publication_year = _fetch_value(row, ['publication_year'])
+
+            row_template = {
+                "DB": "open_alex",
+                "UT": _fetch_value(row, ["id"]),
+                "DI": _fetch_value(row, ["doi"]),
+                "PMID": _fetch_value(row, ["ids", "pmid"]),
+                "TI": _fetch_value(row, ["title"]),
+                "SO": journal_name,
+                "JI": _calculate_JI(journal_name),
+                "PY": publication_year,
+                "DT": _fetch_value(row, ['type']),
+                "LA": _fetch_value(row, ['language']),
+                "TC": _fetch_value(row, ['cited_by_count']),
+                "AU": _calculate_AU_or_AF(authorships),
+                "AF": _calculate_AU_or_AF(authorships, fullname=True),
+                "C1": _calculate_C1(authorships),
+                "RP": _calculate_RP(authorships),
+                "CR": _fetch_value(row, ['referenced_works'], is_return_list=True),
+                "DE": keywords,
+                # Separate list object so DE and ID cells never alias each other.
+                "ID": list(keywords),
+                "AB": _calculate_AB(_fetch_value(row, ['abstract_inverted_index'])),
+                "VL": _fetch_value(row, ['biblio', 'volume']),
+                "IS": _fetch_value(row, ['biblio', 'issue']),
+                "BP": _fetch_value(row, ['biblio', 'first_page']),
+                "EP": _fetch_value(row, ['biblio', 'last_page']),
+                "SR": _calculate_SR(
+                    first_authorship=_get_first_authorship(authorships),
+                    release_year=publication_year,
+                    journal_name=journal_name,
+                ),
+            }
+
+            result.append(row_template)
+
+        except Exception as e:
+            # Best effort: one malformed work must not abort the whole page of results.
+            logging.warning(f'Omitting row "#{index}" due to exception:\n{e}\n\n')
+
+    return pd.DataFrame(result)
+
+#nltk.download('wordnet')
+
+# def transform_to_df(data):
+# return pd.DataFrame(data)
+
+# def _load_json_from_file(filename):
+# with open(filename, 'r') as f:
+# data = json.load(f)
+# return data
+# input_df = transform_to_df(_load_json_from_file('open_alex_motorcycle_results.json'))
+# output_df = transform_from_open_alex(input_df)
+# print(output_df.head(20))
+# with open('open_alex_result.csv', 'w', encoding='utf-8') as f: # Print to file to check
+# f.write(output_df.to_csv())
\ No newline at end of file
diff --git a/www/services/openalex/utils.py b/www/services/openalex/utils.py
new file mode 100644
index 000000000..9d7a4e443
--- /dev/null
+++ b/www/services/openalex/utils.py
@@ -0,0 +1,8 @@
+def get_count_open_alex(data : dict) -> int:
+    """Return the value stored at data["meta"]["count"]; raises KeyError when a key is missing."""
+    meta = data["meta"]
+    return meta["count"]
+
+def get_page_open_alex(data : dict) -> str:
+    """Return the value stored at data["meta"]["page"]; raises KeyError when a key is missing."""
+    meta = data["meta"]
+    return meta["page"]
+
+def get_results_open_alex(data : dict) -> dict:
+    """Return the value stored at data["results"]; raises KeyError when the key is missing."""
+    results = data["results"]
+    return results
\ No newline at end of file
diff --git a/www/services/parsers.py b/www/services/parsers.py
index 72b9d370e..c65da921c 100644
--- a/www/services/parsers.py
+++ b/www/services/parsers.py
@@ -1,4 +1,4 @@
-from .utils import *
+from www.services.utils import *
#### WEB OF SCIENCE PARSER ####
@@ -38,40 +38,39 @@ def parse_wos_data(datapath): # PARSER FOR WEB OF SCIENCE TXT and CIW
return elem_data
-
#### PUBMED PARSER ####
def parse_pubmed_data(datapath): # PARSER FOR PUBMED TXT
- data = []
- current_record = {}
-
+ data = []
with open(datapath, 'r', encoding='utf-8') as file:
- lines = file.readlines()
+ file_data = file.read()
+ paper_begin_pattern = r'(?=PMID\s*-\s*\d+)'
+ papers = re.split(paper_begin_pattern, file_data)
- for line in lines:
- # line = line.decode('utf-8') # Decode the line from bytes to string
- if line.strip() == '':
- # If the line is empty, add the current record to the data
- if current_record:
- data.append(current_record)
- current_record = {}
- continue
-
- key_match = re.match(r'^([A-Z]+)\s*-\s*(.+)', line)
- if key_match:
- key = key_match.group(1)
- value = key_match.group(2)
-
- if key in current_record:
- current_record[key] += ';' + value
+ for paper in papers:
+ current_record = {}
+ lines = paper.split("\n")
+ for line in lines:
+ # line = line.decode('utf-8') # Decode the line from bytes to string
+ if line.strip() == '':
+ # If the line is empty, skip it
+ continue
+
+ key_match = re.match(r'^([A-Z]+)\s*-\s*(.+)', line)
+ if key_match:
+ key = key_match.group(1)
+ value = key_match.group(2)
+
+ if key in current_record:
+ current_record[key] += ';' + value
+ else:
+ current_record[key] = value
else:
- current_record[key] = value
- else:
- # Add the content to the previous key
- current_record[key] += ' ' + line.strip()
+ # Add the content to the previous key
+ current_record[key] += ' ' + line.strip()
- # Add the last record if present
- if current_record:
- data.append(current_record)
+ # Add the last record if present
+ if current_record:
+ data.append(current_record)
return data
diff --git a/www/services/pubmed_api/api_service.py b/www/services/pubmed_api/api_service.py
new file mode 100644
index 000000000..33f0f4b56
--- /dev/null
+++ b/www/services/pubmed_api/api_service.py
@@ -0,0 +1,46 @@
+import requests
+import pandas as pd
+
+PUBMED_SEARCH_ENDPOINT = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+PUBMED_FETCH_ENDPOINT = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+def search_pubmed(query, retmax=20, timeout=30):
+    """Run an esearch request against PubMed and return the raw response text.
+
+    The request sets usehistory=y.
+
+    Args:
+        query: Search term sent as the 'term' parameter
+        retmax: Value sent as the 'retmax' parameter
+        timeout: Seconds to wait for the server before giving up
+
+    Returns:
+        str: Body of the response, undecoded
+
+    Raises:
+        requests.RequestException: On connection errors, timeouts or HTTP error statuses
+    """
+    params = {
+        "db": "pubmed",
+        "retmax": retmax,
+        "term": query,
+        "usehistory": "y"
+    }
+    # Without a timeout a stalled connection would block the caller forever.
+    response = requests.get(PUBMED_SEARCH_ENDPOINT, params=params, timeout=timeout)
+    response.raise_for_status()
+    return response.text
+
+def search_webenv(webenvId, query_key, restart=0, retmax=20, timeout=30):
+    """Run an esearch request that refers to a stored query by WebEnv and query key.
+
+    Args:
+        webenvId: Value sent as the 'WebEnv' parameter
+        query_key: Value sent as the 'query_key' parameter
+        restart: Value sent as the 'retstart' parameter
+        retmax: Value sent as the 'retmax' parameter
+        timeout: Seconds to wait for the server before giving up
+
+    Returns:
+        str: Body of the response, undecoded
+
+    Raises:
+        requests.RequestException: On connection errors, timeouts or HTTP error statuses
+    """
+    params = {
+        "db": "pubmed",
+        "query_key": query_key,
+        "WebEnv": webenvId,
+        "retstart": restart,
+        "retmax": retmax
+    }
+    # Without a timeout a stalled connection would block the caller forever.
+    response = requests.get(PUBMED_SEARCH_ENDPOINT, params=params, timeout=timeout)
+    response.raise_for_status()
+    return response.text
+
+
+def fetch_pubmed(webEnv, query_key, restart=0, retmax=100, timeout=30):
+    """Run an efetch request for a stored query and return the records as MEDLINE text.
+
+    The request asks for retmode=text and rettype=medline.
+
+    Args:
+        webEnv: Value sent as the 'WebEnv' parameter
+        query_key: Value sent as the 'query_key' parameter
+        restart: Value sent as the 'retstart' parameter
+        retmax: Value sent as the 'retmax' parameter
+        timeout: Seconds to wait for the server before giving up
+
+    Returns:
+        str: Body of the response
+
+    Raises:
+        requests.RequestException: On connection errors, timeouts or HTTP error statuses
+    """
+    params = {
+        "db": "pubmed",
+        "query_key": query_key,
+        "WebEnv": webEnv,
+        "retmode": "text",
+        "rettype": "medline",
+        "retstart": restart,
+        "retmax": retmax
+    }
+    # Without a timeout a stalled connection would block the caller forever.
+    response = requests.get(PUBMED_FETCH_ENDPOINT, params=params, timeout=timeout)
+    response.raise_for_status()
+    return response.text
\ No newline at end of file
diff --git a/www/services/pubmed_api/pagination.py b/www/services/pubmed_api/pagination.py
new file mode 100644
index 000000000..37a47f756
--- /dev/null
+++ b/www/services/pubmed_api/pagination.py
@@ -0,0 +1,30 @@
+from .api_service import fetch_pubmed
+import os
+import pandas as pd
+from www.services.format_functions import process_single_file
+
+# Columns kept in the resulting DataFrame, in this order
+DF_COLUMNS = ["DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", "AU", "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR"]
+
+def create_df_from_pubmed_data(papers_dict_list):
+    """Build a DataFrame from paper dicts, keeping only the keys listed in DF_COLUMNS.
+
+    Args:
+        papers_dict_list: Iterable of dicts, one per paper
+
+    Returns:
+        pd.DataFrame: One row per paper; keys outside DF_COLUMNS are dropped
+    """
+    trimmed_rows = [
+        {column: paper[column] for column in DF_COLUMNS if column in paper}
+        for paper in papers_dict_list
+    ]
+    return pd.DataFrame(trimmed_rows)
+
+# Parsing papers data using the library's parser
+def _get_bibliometrix_parsed_data_file(papers_data):
+    """Write the fetched records to a temporary .txt file and parse it with process_single_file.
+
+    Each call uses its own uniquely named file, so concurrent requests cannot overwrite
+    each other's data. The file is closed before it is parsed and removed afterwards.
+
+    Args:
+        papers_data: Fetched records; written to the file via str()
+
+    Returns:
+        Whatever process_single_file returns for the "pubmed" / ".txt" / "fullname" arguments
+    """
+    import tempfile
+
+    # NOTE(review): assumes process_single_file only needs the path and does not depend
+    # on the file being named "pubmed.txt" -- confirm against format_functions.
+    fd, tmp_path = tempfile.mkstemp(prefix="pubmed_", suffix=".txt")
+    try:
+        with os.fdopen(fd, "w", encoding='utf-8') as f:
+            f.write(str(papers_data))
+        return process_single_file(os.path.abspath(tmp_path), "pubmed", ".txt", "fullname")
+    finally:
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            # Cleanup is best effort; a leftover temp file must not mask the real result.
+            pass
+
+def get_paginated_papers_df(webEnv, query_key, page=0, retmax=20):
+    """Fetch one page of a stored PubMed query and return it as a DataFrame.
+
+    Args:
+        webEnv: WebEnv value passed to fetch_pubmed
+        query_key: Query key passed to fetch_pubmed
+        page: Page to fetch. Pages 0 and 1 both select the first page; negative values
+            are treated as the first page too.
+        retmax: Number of records per page
+
+    Returns:
+        pd.DataFrame: The parsed records passed through create_df_from_pubmed_data
+    """
+    # Pages are 1-based here while the fetch offset is 0-based; clamping at 0 keeps a
+    # negative page from producing a negative offset.
+    restart = max(page - 1, 0) * retmax
+    papers_data = fetch_pubmed(webEnv, query_key, restart=restart, retmax=retmax)
+    papers_dict_list = _get_bibliometrix_parsed_data_file(papers_data)
+    # Debug dump of the parsed records. Explicit UTF-8, because records can contain
+    # characters that the platform default encoding cannot write.
+    with open("biblio_parsed.txt", "w", encoding="utf-8") as ff:
+        ff.write(str(papers_dict_list))
+    return create_df_from_pubmed_data(papers_dict_list)
\ No newline at end of file
diff --git a/www/services/pubmed_api/xml_parser.py b/www/services/pubmed_api/xml_parser.py
new file mode 100644
index 000000000..6e08b2c29
--- /dev/null
+++ b/www/services/pubmed_api/xml_parser.py
@@ -0,0 +1,30 @@
+import xml.etree.ElementTree as ET
+
+def _get_ids_from_xml(xml_data):
+    """Return the text of every <Id> child of the root's <IdList> element."""
+    id_list = ET.fromstring(xml_data).find("IdList")
+    ids = []
+    for id_node in id_list.findall("Id"):
+        ids.append(id_node.text)
+    return ids
+
+def _get_count(xml_data):
+    """Return the text of the root's <Count> element converted to int."""
+    count_node = ET.fromstring(xml_data).find("Count")
+    return int(count_node.text)
+
+def _get_webenv(xml_data):
+    """Return the text of the root's <WebEnv> element."""
+    webenv_node = ET.fromstring(xml_data).find("WebEnv")
+    return webenv_node.text
+
+def _get_query_key(xml_data):
+    """Return the text of the root's <QueryKey> element."""
+    query_key_node = ET.fromstring(xml_data).find("QueryKey")
+    return query_key_node.text
+
+
+def get_data_from_query(xml_data, retmax):
+    """Extract the paging information from a search response.
+
+    Args:
+        xml_data: XML text containing Count, WebEnv and QueryKey elements
+        retmax: Number of records per page
+
+    Returns:
+        tuple: (number of pages, WebEnv text, QueryKey text); the page count is the
+            record count divided by retmax, rounded up
+    """
+    count = _get_count(xml_data)
+    # Round up: a remainder means one more, partially filled, page.
+    number_pages, leftover = divmod(count, retmax)
+    if leftover != 0:
+        number_pages += 1
+
+    webEnv = _get_webenv(xml_data)
+    query_key = _get_query_key(xml_data)
+
+    return number_pages, webEnv, query_key