From 547d07cd1919d184c747dc6b2bd0311857e46055 Mon Sep 17 00:00:00 2001 From: nixxdd Date: Wed, 17 Jun 2026 20:23:52 +0200 Subject: [PATCH] Added Pubmed API service and OpenAlex API service to the project. Adding them to the interface --- .gitignore | 9 +- app.py | 480 ++++++++++++++++++++++-- functions/__init__.py | 4 +- functions/get_database.py | 57 +-- functions/get_localcitedsources.py | 7 +- functions/get_table.py | 20 +- functions/load_df_from_parquet.py | 22 ++ functions/save_api_results_to_file.py | 71 ++++ requirements.txt | Bin 4626 -> 4840 bytes www/services/format_functions.py | 18 +- www/services/openalex/__init__.py | 6 + www/services/openalex/api_service.py | 46 +++ www/services/openalex/pagination.py | 80 ++++ www/services/openalex/parser.py | 493 +++++++++++++++++++++++++ www/services/openalex/utils.py | 8 + www/services/parsers.py | 57 ++- www/services/pubmed_api/api_service.py | 46 +++ www/services/pubmed_api/pagination.py | 30 ++ www/services/pubmed_api/xml_parser.py | 30 ++ 19 files changed, 1394 insertions(+), 90 deletions(-) create mode 100644 functions/load_df_from_parquet.py create mode 100644 functions/save_api_results_to_file.py create mode 100644 www/services/openalex/__init__.py create mode 100644 www/services/openalex/api_service.py create mode 100644 www/services/openalex/pagination.py create mode 100644 www/services/openalex/parser.py create mode 100644 www/services/openalex/utils.py create mode 100644 www/services/pubmed_api/api_service.py create mode 100644 www/services/pubmed_api/pagination.py create mode 100644 www/services/pubmed_api/xml_parser.py diff --git a/.gitignore b/.gitignore index 23b99e089..b7e889daa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,11 @@ +.DS_Store __pycache__/ +biblio_parsed.txt +pubmed.txt +pubmed_data.csv +pubmed_data.csv bibliovenv/ Bibenv/ -.idea/ \ No newline at end of file +.idea/ +.venv +data/ diff --git a/app.py b/app.py index f0891f894..350da386c 100644 --- a/app.py +++ b/app.py @@ -45,13 
+45,13 @@ # ----- # Author: PRAISELab Team - # Import necessary libraries for better performance - avoid importing everything import tempfile import os import requests import functools from datetime import datetime +import traceback import pandas as pd import io from functions import * @@ -64,6 +64,33 @@ from shinywidgets import render_widget from shiny.express import ui, input, render +## pubmed API imports +from www.services.pubmed_api.api_service import search_pubmed, fetch_pubmed +from www.services.pubmed_api.pagination import create_df_from_pubmed_data, get_paginated_papers_df +from www.services.pubmed_api.xml_parser import get_data_from_query + +## openalex API imports +from www.services.openalex.api_service import search_open_alex +from www.services.openalex.parser import transform_from_open_alex +from www.services.openalex.pagination import * +from www.services.openalex.utils import * + +# JS Snippet to fix caching issue auto loading the last value of "select" +ui.tags.script(""" +document.addEventListener('DOMContentLoaded', function() { + var sel = document.getElementById('select'); + if (sel) { + sel.setAttribute('autocomplete', 'off'); + // Force back to the intended default on every fresh load + sel.value = '0'; + // Notify Shiny's reactive system the value changed (in case it was already '0' silently) + var event = new Event('change', { bubbles: true }); + sel.dispatchEvent(event); + } +}); +""") + + # Setup the Directory for static assets - optimized for performance base_dir = tempfile.gettempdir() # Use system temp dir instead of creating new temp file express.app_opts(static_assets=base_dir, debug=False) @@ -83,6 +110,11 @@ # Include custom CSS for the app's appearance. 
ui.include_css("www/static/biblioshiny.css") + +start_trigger = reactive.Value(0) +api_source = reactive.Value(None) +df = reactive.Value(None) + # --- Header --- # The header bar contains the logo, app name, and a set of dropdown menus for notifications, help, donations, and credits. with ui.tags.div(class_="header-bar"): @@ -575,7 +607,6 @@ def get_latest_cran_version(): ui.h3("📊 Data Management", style="color: #5567BB;") ui.p("Easily import, load, or export your dataset.") # ---------- INITIALIZE VARIABLES ---------- - df = reactive.Value(None) # Optimized function to reset analysis results when dataset changes def reset_all_analyses(): @@ -641,17 +672,20 @@ def create_loading_modal(analysis_type="analysis"): with ui.layout_sidebar(fillable=False, fill=False): # Sidebar for data import options with ui.sidebar(id="sidebar_load_data", position="right" ): - # Section for Import or Load + # Section for Import or Load ui.h5("Data Import Options", style="color: #5567BB;") ui.input_select( - "select", - "Choose an action:", - { - "": "-", + id= "select", + label="Choose an action:", + choices={ + "0": "-", "1A": "Import raw data file(s)", "1B": "Load Bibliometrix file(s)", - "1C": "Use a sample dataset" + "1C": "Use a sample dataset", + "1D": "API Import", + "1E": "Load locally saved API results" }, + selected="0" ) @render.express() @@ -711,7 +745,19 @@ def select_db(): ui.input_action_button("start_button", "Start", icon=ICONS["play"]) ui.markdown("Select a predefined sample dataset for testing purposes.") - else: + elif input.select() == "1D": + ui.p("Redirecting to the API Import tab.", style="color: gray;") + + elif input.select() == "1E": + ui.input_file( + "parquet_dataset", + "Choose a File", + accept=[".pqt"] + ) + ui.p("Load a saved parquet file from data folder", style="color: gray; font-size: 10px; margin-top: -20px;") + ui.input_action_button("load_parquet_file", "Start", icon=ICONS["play"]) + + else: ui.p("Please select a valid action to begin managing 
your data.", style="color: gray;") ui.p("Follow the instructions below to manage your data efficiently:") ui.markdown( @@ -738,30 +784,92 @@ def select_db(): # ) # ui.input_action_button("export_button", "Export", icon=ICONS["download"], disabled=True) - @render.express() + @reactive.effect + @reactive.event(input.select) + def _redirect_to_api(): + if input.select() == "1D": + ui.update_navs("hidden_tabs", selected="API") + + @reactive.effect + @reactive.event(input.load_parquet_file) + def load_local_parquet_file(): + logging.debug('App.py - Invoked "load_local_parquet_file()"') + if input.select() == "1E": + logging.debug('App.py - invoked "load_local_parquet_file()"') + logging.debug(input.parquet_dataset()) + df_from_disk = load_df_from_parquet(filepath=input.parquet_dataset()[0]["datapath"]) + df.set(df_from_disk) + + # Trigger for opening analysis page + if start_trigger.get() == 0: + start_trigger.set(1) + + ui.update_navs(id='hidden_tabs', selected='import') + + + @reactive.effect + @reactive.event(input.select) + def _ (): + logging.debug(f'App.py - Select value: "{input.select()}"') + + @reactive.effect @reactive.event(input.start_button) + def _trigger_from_file(): + logging.debug('App.py - Invoked "_trigger_from_file()"') + start_trigger.set(1) + #ui.update_navs(id='hidden_tabs', selected='import') + + @render.express() def mostra(): - database = get_database(input) + logging.debug('App.py - Invoked mostra() function') + trigger = start_trigger.get() + if trigger == 0: + return + logging.debug(f'Mostra function - trigger value: "{trigger}"') + + ui.HTML(init_itables()) + + + source = api_source.get() + if source is not None: + database = "PubMed" if source == "pubmed" else "OpenAlex" + #api_source.set(None) # reset per la prossima volta + else: + database = get_database(input) + # database = get_database(input) ui.update_sidebar("sidebar_load_data", show=False) ui.update_action_button("export_button", disabled=False) ui.markdown(f"

Data of {database}

") - if database == "Sample": - data = df.set(pd.read_excel("sources/samples/sample.xlsx")) - reset_all_analyses() # Reset analysis results when sample is loaded - - @render.express() - @reactive.event(input.Dataset) - def show_data(): - text = get_data(input, database, df, reset_all_analyses) - text - ui.HTML(init_itables()) - - @render.ui - @reactive.event(input.start_button) - def show_table(): + logging.debug('Invoked show_table function...') + table_ui = ui.h5("No data available.") + try: table_ui, _, _ = get_table(database, df) - return table_ui + except Exception as e: + logging.error(e, exc_info=True) + table_ui + + # if database == "Sample": + # data = df.set(pd.read_excel("sources/samples/sample.xlsx")) + # reset_all_analyses() # Reset analysis results when sample is loaded + + # @render.express() + # @reactive.event(input.Dataset) + # def show_data(): + # text = get_data(input, database, df, reset_all_analyses) + # #text + + + # @render.ui + # @reactive.event(lambda: start_trigger.get()) + # def show_table(): + # logging.debug('Invoked show_table function...') + # table_ui = ui.h5("No data available.") + # try: + # table_ui, _, _ = get_table(database, df) + # except Exception as e: + # logging.error(e, exc_info=True) + # return table_ui # -------- ADVICE BUTTON -------- @render.ui @@ -853,9 +961,321 @@ def indicator_types_ui_all(): """ ), - with ui.nav_panel("None", value="API"): - ui.h3("🚧 Warning: API is under construction 🚧") + ## API SECTION ## + PAGINATION_LIMIT = 10000 + + current_query = reactive.Value(None) + works_per_page = reactive.Value(20) + search_results = reactive.Value(None) + full_results = reactive.Value(None) + max_number_pages = reactive.Value(None) + web_env_store = reactive.Value(None) + query_key_store = reactive.Value(1) + current_page = reactive.Value(0) + is_pubmed = reactive.Value(True) + page_limit = reactive.Value(None) + go_to_page_button_flag = reactive.Value(True) + + with ui.nav_panel("None", value="API"): + + 
ui.h3("PubMed or OpenAlex API", style="color: #5567BB;") + ui.input_select( + "api_select", + "Choose an API to import data from:", + { + "": "-", + "pubmed": "PubMed API", + "openalex": "OpenAlex API" + }, + ) + + @render.express() + def render_api_selection(): + if input.api_select() == "pubmed": + ui.h4("PubMed Search", style="color: #5567BB; margin-top: 20px;") + ui.p("Import data directly from PubMed using their API.", style="color: gray;") + with ui.layout_column_wrap(): + with ui.card(): + ui.input_text("pubmed_query", "Enter PubMed search query (e.g. 'cancer or genomics...')") + with ui.card(): + ui.input_numeric(id="pubmed_works_per_page", label="Select the amounts of works per page: ", min=1, max=100, value=works_per_page.get()) + + @render.text + def pubmed_works_per_page_value(): + value = input.pubmed_works_per_page() + if value < 1 or value > 100 or not isinstance(value, int): + return 'Invalid number. Please choose an integer between 1 and 100' + + works_per_page.set(value) + return f'Selected value: {value}' + + ui.input_action_button("handle_pubmed_search", "Search PubMed", class_="btn-primary") + + elif input.api_select() == "openalex": + is_pubmed.set(False) + ui.h4("OpenAlex Search", style="color: #5567BB; margin-top: 20px;") + ui.p("Import data directly from OpenAlex using their API.", style="color: gray;") + with ui.layout_column_wrap(): + with ui.card(): + ui.input_text("openalex_query", "Enter OpenAlex search query (e.g. 'cancer or genomics...')") + + with ui.card(): + ui.input_numeric(id="openalex_works_per_page", label="Select the amounts of works per page: ", min=1, max=100, value=works_per_page.get()) + + @render.text + def openalex_works_per_page_value(): + value = input.openalex_works_per_page() + if value < 1 or value > 100 or not isinstance(value, int): + return 'Invalid number. 
Please choose an integer between 1 and 100' + + works_per_page.set(value) + return f'Selected value: {value}' + + ui.input_action_button("handle_openalex_search", "Search OpenAlex", class_="btn-primary") + + results = search_results() + if results is not None: + max_pages = max_number_pages() + page = current_page() + page_limit_value = page_limit.get() + + ui.h4(f"Search Results for '{current_query.get()}':", style="color: #5567BB; margin-top: 20px;") + ui.h5(f"Total Number of pages: {max_pages} | Works per page: {works_per_page.get()}", style="color: gray; margin-bottom: 20px;") + ui.h5(f"Current page: {page}", style="color: gray; margin-bottom: 20px;") + + with ui.layout_column_wrap(): + with ui.card(): # (optional) navigating to the page + ui.input_numeric('page_number', f"Go to page: (Max {page_limit_value})", min=1, max=page_limit_value, value=int(page)) + + @render.text + def page_number_value(): + value = input.page_number() + if not isinstance(value, int) or value < 1 or value > page_limit_value: + go_to_page_button_flag.set(True) + return f'Invalid number. 
Please choose an integer between 1 and {page_limit_value}' + + go_to_page_button_flag.set(False) + return f'Selected value: {value}' + + #ui.input_text("page_number", f"Go to page: (Max {page_limit.get()})", value=str(page)) + ui.input_action_button("go_to_page", "Go", class_="btn-secondary", disabled=go_to_page_button_flag.get()) + + with ui.card(): + ui.input_action_button("prev_page", "← Previous", disabled=(page <= 1)) + ui.input_action_button("next_page", "Next →", disabled=(page >= page_limit.get())) + + @render.data_frame + def results_df(): + df = search_results() + if df is None: + return render.DataGrid( + pd.DataFrame()) + + results_show = df.copy() + + return render.DataGrid( + results_show, + filters=True, + width="100%", + summary=False, + styles=[ + { + "style": { + "vertical-align": "top", + "width": "150px", + "min-width": "150px", + "max-width": "150px", + "overflow": "hidden", + "text-overflow": "ellipsis", + "white-space": "nowrap", + } + } + ] + ) + + ui.h4("Start Analysis", style="color: #5567BB; margin-top: 30px;") + ui.input_action_button("api_start_button", "Start Analysis", icon=ICONS["play"]) + ui.input_action_button("api_save_results_to_file", "Save current page results", icon=ICONS["save"]) + + + @reactive.effect + @reactive.event(input.handle_pubmed_search) + def _handle_pubmed_search(): + is_pubmed.set(True) + try: + query = input.pubmed_query() + # count = input.pubmed_count() + + if not query or query.strip() == "": + ui.notification_show("Please enter a search query", type="error") + return + + current_query.set(query) + + ui.notification_show(f"Searching PubMed for: {query}") + query_result = search_pubmed(query) + retmax = works_per_page.get() + max_pages, webEnv, query_key = get_data_from_query(query_result, retmax) + print(f"Max pages: {max_pages}, WebEnv: {webEnv}") + max_number_pages.set(max_pages) + web_env_store.set(webEnv) + query_key_store.set(query_key) + current_page.set(1) + + computed_limit = compute_page_limit() + 
page_limit.set(computed_limit) + + first_page = get_paginated_papers_df(webEnv, query_key, page=0, retmax = retmax) + search_results.set(first_page) + + + except Exception as e: + logging.exception('pubmed search error') + ui.notification_show(f"Error: {str(e)}", type="error") + + def compute_page_limit(): + computed_limit = max_number_pages.get() + if max_number_pages.get() >= PAGINATION_LIMIT: + computed_limit = PAGINATION_LIMIT // works_per_page.get() + return computed_limit + + + @reactive.effect + @reactive.event(input.handle_openalex_search) + def _handle_openalex_search(): + is_pubmed.set(False) + try: + query = input.openalex_query().lower().strip() + if not query or query.strip() == "": + ui.notification_show("Please enter a search query", type="error") + return + + current_query.set(query) + + ui.notification_show(f"Searching OpenAlex for: {query}") + + #First search, get metadata to update max_pages and display first page + query_result = search_open_alex(query_str=query, works_per_page=works_per_page.get()) + max_number_pages.set( + get_max_pages(query_result) + ) + page_limit.set(PAGINATION_LIMIT // works_per_page.get()) + current_page.set(1) + + first_page = transform_to_df(query_result) + search_results.set(first_page) + + + # max_pages = get_data_from_query_open_alex(query) + # max_number_pages.set(max_pages) + + #print("Max pages for OpenAlex query:", max_pages) + #current_page.set(1) + + # all_df = search_open_alex(query, per_page=None) + # full_results.set(all_df) + + # first_page = paginated_papers_open_alex(query, page=1) + # search_results.set(first_page) + + except Exception as e: + ui.notification_show(f"Error: {str(e)}", type="error") + + @reactive.effect + @reactive.event(input.prev_page) + def _go_prev(): + page = current_page.get() + if page > 1: + new = page - 1 + current_page.set(new) + if is_pubmed.get(): + web_env = web_env_store.get() + query_key = query_key_store.get() + retmax = works_per_page.get() + 
search_results.set(get_paginated_papers_df(web_env, query_key, page=new, retmax=retmax)) + else: + query_results = search_open_alex(query_str=input.openalex_query(), works_per_page=works_per_page.get(), current_page=new) + search_results.set( + transform_to_df(query_results) + ) + + @reactive.effect + @reactive.event(input.next_page) + def _go_next(): + page = current_page.get() + max_pages = max_number_pages.get() + if page < max_pages - 1: + new = page + 1 + current_page.set(new) + if is_pubmed.get(): + web_env = web_env_store.get() + query_key = query_key_store.get() + retmax = works_per_page.get() + search_results.set(get_paginated_papers_df(web_env, query_key, page=new, retmax=retmax)) + else: + query_results = search_open_alex(query_str=input.openalex_query(), works_per_page=works_per_page.get(), current_page=new) + search_results.set( + transform_to_df(query_results) + ) + + @reactive.effect + @reactive.event(input.go_to_page) + def _go_to_page(): + val = input.page_number() + + if isinstance(val, int): + + current_page.set(val) + + if is_pubmed.get(): + web_env = web_env_store.get() + query_key = query_key_store.get() + retmax = works_per_page.get() + search_results.set(get_paginated_papers_df(web_env, query_key, page=val, retmax=retmax)) + else: + query_results = search_open_alex(query_str=input.openalex_query(), works_per_page=works_per_page.get(), current_page=val) + search_results.set( + transform_to_df(query_results) + ) + + + @reactive.effect + @reactive.event(input.api_start_button) + def _api_start(): + logging.debug('App.py - Invoked "_api_start()"') + api_source.set(input.api_select()) + df.set(search_results.get()) + print(df.get()) + + # Trigger for opening analysis page + if start_trigger.get() == 0: + start_trigger.set(1) + + ui.notification_show("Analysis done. 
See Main Information tab from the Overview side bar.") + + ui.update_navs(id='hidden_tabs', selected='import') + + @reactive.effect + @reactive.event(input.api_save_results_to_file) + def api_save_results_to_file(): + logging.debug('Invoked "api_save_results_to_file()"') + df.set(search_results.get()) + logging.debug(f'Results to save:\n{df.get()}\n\n') + + # for col, dtype in df.get().dtypes.items(): + # logging.debug(f"Column: {col:<20} Type: {dtype}") + + query_text = current_query.get() + if isinstance(query_text, str) and query_text.strip() != "": + status = save_api_results_to_file(folder_path=r'.\data', file_name=f'{query_text} search results.pqt', df_to_save=df.get()) + logging.debug(f'app.py - status value: "{status}"') + + if not status: + ui.notification_show("Something went wrong .", type='error') + else: + ui.notification_show(f'Local file updated: "{query_text} search results.pqt"') + with ui.nav_panel("None", value="collections"): ui.h3("🚧 Warning: Merge Collection is under construction 🚧") @@ -8185,8 +8605,10 @@ def update_plot_settings(): # --- Sidebar Management --- @render.express() -@reactive.event(input.start_button) def toggle_sidebar(): + trigger = start_trigger.get() + if trigger == 0: + return with ui.tags.div(id="sidebar_2", class_="custom-sidebar"): with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False): # Info Section diff --git a/functions/__init__.py b/functions/__init__.py index 20e24de36..f0107e8c5 100644 --- a/functions/__init__.py +++ b/functions/__init__.py @@ -40,4 +40,6 @@ from .get_thematicevolution import * from .get_cocitation import * from .get_collaborationnetwork import * -from .get_worldmapcollaboration import * \ No newline at end of file +from .get_worldmapcollaboration import * +from .save_api_results_to_file import * +from .load_df_from_parquet import * \ No newline at end of file diff --git a/functions/get_database.py b/functions/get_database.py index 5c5d4edc5..7140ee96c 100644 --- 
a/functions/get_database.py +++ b/functions/get_database.py @@ -1,3 +1,4 @@ +import logging from www.services import * @@ -11,27 +12,41 @@ def get_database(input): Returns: A string representing the name of the database. """ - if input.select() == "1A": # Bibliographic databases + database = '' + try: + if input.select() == "1A": # Bibliographic databases + + database = input.database() + + if database == "wos": + database = "Web of Science" + elif database == "scopus": + database = "Scopus" + elif database == "dimensions": + database = "Dimensions" + elif database == "lens": + database = "Lens.org" + elif database == "pubmed": + database = "PubMed" + elif database == "cochrane": + database = "Cochrane Library" - database = input.database() + elif input.select() == "1B": # Bibliometrix database + database = "Bibliometrix" - if database == "wos": - database = "Web of Science" - elif database == "scopus": - database = "Scopus" - elif database == "dimensions": - database = "Dimensions" - elif database == "lens": - database = "Lens.org" - elif database == "pubmed": - database = "PubMed" - elif database == "cochrane": - database = "Cochrane Library" - - elif input.select() == "1B": # Bibliometrix database - database = "Bibliometrix" - - elif input.select() == "1C": # Sample database - database = "Sample" - + elif input.select() == "1C": # Sample database + database = "Sample" + + elif input.select() == "1E": + database = "Local parquet file" + + elif input.api_select() in ['pubmed', 'openalex']: + database_name = input.api_select() + if database_name == 'pubmed': + database = 'PubMed' + if database_name == 'openalex': + database = 'OpenAlex' + except Exception as e: + logging.error(f'Error: \n{e}\n\n') + return database diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py index 74b261455..f188724a9 100644 --- a/functions/get_localcitedsources.py +++ b/functions/get_localcitedsources.py @@ -1,5 +1,7 @@ from www.services import * - +import 
numpy as np +import logging +import pandas as pd def get_local_cited_sources(df, num_of_cited_sources): """ @@ -99,6 +101,9 @@ def wrap_label(label, width=50): # Set x-axis ticks to 0, 50, 100, etc. max_x = source_counts["N. of Local Citations"].max() + logging.debug(f'get_localCitedSources.py - max_x value= "{max_x}", max_x type: "{type(max_x)}"') + if pd.isna(max_x): + max_x = 10 tick_step = 50 x_ticks = list(range(0, int(max_x) + tick_step, tick_step)) if x_ticks[-1] < max_x: diff --git a/functions/get_table.py b/functions/get_table.py index 75b9c91d8..34293996e 100644 --- a/functions/get_table.py +++ b/functions/get_table.py @@ -1,5 +1,6 @@ from www.services import * from functions.get_status import * +import logging # Function to create a Plotly table visualization for metadata completeness @@ -78,8 +79,11 @@ def get_table(database, df, dpi=300, filter=False, modal=True): Returns: A DataTable object if data is available, otherwise a message indicating no data. """ + logging.debug('get_table - Invoked "get_table()"') # Retrieve the data from the DataFrame data = df.get() + logging.debug(f'data: \n{data}') + table_html = "" fig = None @@ -125,8 +129,17 @@ def get_table(database, df, dpi=300, filter=False, modal=True): } # Count missing values (NaN), empty strings, and empty lists in each column - missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + ( - data.map(lambda x: x == [])).sum() + def is_empty_string(x): + # If it's a list, numpy array, or dictionary, it's not an empty string + if hasattr(x, '__len__') and not isinstance(x, (str, bytes)): + return False + return x in ["", " "] + + is_missing = data.isna() | data.map(is_empty_string) + missing_counts = is_missing.sum() + + # missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + ( + # data.map(lambda x: x == [])).sum() # Calculate the percentage of missing values for each column missing_percentage = (missing_counts / total_rows) * 100 @@ -149,6 +162,9 @@ 
def get_table(database, df, dpi=300, filter=False, modal=True): # Create and return the Plotly table fig = create_plotly_table(sorted_columns, dpi) + logging.debug('get_table - FIG OBJECT') + logging.debug(fig) + # HTML table header table_header = """ diff --git a/functions/load_df_from_parquet.py b/functions/load_df_from_parquet.py new file mode 100644 index 000000000..d32b3f34d --- /dev/null +++ b/functions/load_df_from_parquet.py @@ -0,0 +1,22 @@ +import pandas as pd +import logging + +def _clean_numpy_collections(df: pd.DataFrame) -> pd.DataFrame: + """Converts any embedded numpy arrays inside object columns back to native python lists.""" + import numpy as np + + # Target only 'object' columns which hold strings, lists, or arrays + object_cols = df.select_dtypes(include=['object']).columns + + for col in object_cols: + # Check if the column contains any numpy arrays + if df[col].apply(lambda x: isinstance(x, np.ndarray)).any(): + df[col] = df[col].apply(lambda x: list(x) if isinstance(x, np.ndarray) else x) + + return df + +def load_df_from_parquet(filepath): + df_local = pd.read_parquet(filepath) + df_local = _clean_numpy_collections(df_local) + logging.debug(f'load_df_from_parquet - {df_local}') + return df_local \ No newline at end of file diff --git a/functions/save_api_results_to_file.py b/functions/save_api_results_to_file.py new file mode 100644 index 000000000..541eeea4b --- /dev/null +++ b/functions/save_api_results_to_file.py @@ -0,0 +1,71 @@ +from pathlib import Path +import pandas as pd +import logging + +def save_api_results_to_file(folder_path: str, file_name: str, df_to_save: pd.DataFrame) -> bool: + """ + Function to save the results of an API call to local storage. The saved files are grouped by "query name". 
+ + Returns: + bool: True if ok, False if error + """ + if not isinstance(folder_path, str): + logging.error(f'Arg "folder_path" is of type "{type(folder_path)}", expected a str') + return False + if not isinstance(file_name, str): + logging.error(f'Arg "file_name" is of type "{type(file_name)}", expected a str') + return False + if not isinstance(df_to_save, pd.DataFrame): + logging.error(f'Arg "df_to_save" is of type "{type(df_to_save)}", expected a pd.DataFrame') + return False + + logging.debug(df_to_save.info(verbose=True)) + + dir_path = Path(folder_path) + dir_path.mkdir(parents=True, exist_ok=True) + + file_path = dir_path / file_name + + if not file_path.exists(): + logging.warning(f"File not found. Initializing a new dataset for {file_path}...") + df_local = df_to_save.iloc[0:0].copy() + else: + try: + df_local = pd.read_parquet(file_path) + logging.debug(df_local.info(verbose=True)) + except Exception as e: + logging.exception(f"Error reading the file: {file_path}") + return False + + + if set(df_local.columns) == set(df_to_save.columns): + logging.info("Structures match. Checking for new rows...") + + combined = pd.concat([df_local, df_to_save], ignore_index=True) + + merged = combined.drop_duplicates(subset=["UT"], keep="last") + + new_rows_count = len(merged) - len(df_local) + + logging.debug(merged.info(verbose=True)) + logging.debug(merged) + logging.debug(new_rows_count) + + if new_rows_count > 0: + try: + merged.to_parquet(file_path) + logging.info(f"Successfully added {new_rows_count} new rows to the file.") + except Exception as e: + logging.error('Something went wrong. Did not update file') + logging.error(str(e)) + return False + else: + logging.info("No new rows to add. File is up to date.") + + else: + logging.error( + "Structure mismatch! The file structure does not match the DataFrame in memory." 
+ ) + return False + + return True \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d94f94d9fde545db192036336adda97dbb14fb42..94705efb0073c7b53cb4b2be5f26822f46f2cd9b 100644 GIT binary patch delta 359 zcmbQF@kq=49HGAnB_E6Cx-n`>EL zGs&8O-JJ)tGZ$!PCDcTazw{W)Cbx2|W;EVx$jQkFbbb-gv~s8dkhj2QZr10T&cq3G zq2=U@Jo2m{Nu$YMdDbd|{80)tw*;s)9~d^pVC@!Q(?Qw|C(q_x%xJ#Zo==~V(Rgw` N|93{S&FKQG838RtL|y;@ delta 187 zcmaE%I!R?i7$dhSgC2tk5F0U=O-^JClr>`DW#D2+X2@a41d`bdwm@hBls5!o^U3EK z(v`o_K`M~ diff --git a/www/services/format_functions.py b/www/services/format_functions.py index 1a8ee7af4..11f8ac762 100644 --- a/www/services/format_functions.py +++ b/www/services/format_functions.py @@ -523,7 +523,11 @@ def format_di_column(entry, source, file_type): # Function for DI Column doi = entry.get('DI', [''])[0] elif source == 'PubMed': if file_type == '.txt': - doi = entry.get('LID', '') + doi_pattern = r'(.+?)\s*\[doi\]$' + doi_raw = entry.get('LID', '') + for doi_value in doi_raw.split(";"): + if re.match(doi_pattern, doi_value): + return doi_value.replace('[doi]', '').strip() elif source == 'Scopus': if file_type == '.bib': doi = entry.get('doi', '') @@ -993,6 +997,8 @@ def format_py_column(entry, source, file_type): # Function for PY Column if file_type == '.txt': publication_year = entry.get('DP', '') publication_year = re.findall(r'\d{4}', publication_year)[0] if publication_year else '' + if publication_year != '': + publication_year = int(publication_year) if publication_year.isdigit() else 0 elif source == 'Scopus': if file_type == '.bib': publication_year = str(entry.get('year', '')) @@ -1627,11 +1633,11 @@ def process_single_file(data, source, file_type, author): if column not in entry_data: # Avoid overwriting existing keys entry_data[column] = entry.get(column, None) - # Remove the column based on the value of the 'author' field - if author == "surname": - entry_data.pop('AF', None) # Remove 'AF' if it exists - elif author == "fullname": - 
entry_data.pop('AU', None) # Remove 'AU' if it exists + # # Remove the column based on the value of the 'author' field + # if author == "surname": + # entry_data.pop('AF', None) # Remove 'AF' if it exists + # elif author == "fullname": + # entry_data.pop('AU', None) # Remove 'AU' if it exists entries.append(entry_data)
diff --git a/www/services/openalex/__init__.py b/www/services/openalex/__init__.py
new file mode 100644
index 000000000..78883939f
--- /dev/null
+++ b/www/services/openalex/__init__.py
@@ -0,0 +1,8 @@
+import nltk
+import logging
+
+# NOTE(review): this configures the root logger at import time, which affects the whole application.
+logging.basicConfig(level=logging.DEBUG)
+logging.getLogger('parser').setLevel(logging.WARNING)
+# Fetch the WordNet corpus up front; quiet=True keeps the downloader from printing on every import.
+nltk.download('wordnet', quiet=True)
diff --git a/www/services/openalex/api_service.py b/www/services/openalex/api_service.py
new file mode 100644
index 000000000..6a2d7ea1e
--- /dev/null
+++ b/www/services/openalex/api_service.py
@@ -0,0 +1,54 @@
+import os
+from urllib.parse import quote_plus
+
+import requests
+from .utils import *
+import pandas as pd
+from .parser import transform_from_open_alex
+
+# The API key is a credential: read it from the environment, never commit it.
+OPEN_ALEX_KEY = os.environ.get("OPEN_ALEX_KEY", "")
+_API_KEY_PARAM = f"api_key={OPEN_ALEX_KEY}&" if OPEN_ALEX_KEY else ""
+OPEN_ALEX_ENDPOINT = f"https://api.openalex.org/works?{_API_KEY_PARAM}search="
+# Seconds to wait for OpenAlex before giving up instead of hanging forever.
+REQUEST_TIMEOUT = 30
+
+def search_open_alex(query_str: str, current_page:int =1, works_per_page:int =20) -> dict:
+    """Query the OpenAlex works endpoint and return the decoded JSON response.
+
+    Args:
+        query_str (str): OpenAlex search string; it is URL-encoded before sending
+        current_page (int): Positive page number to retrieve
+        works_per_page (int): Positive number of works per page
+
+    Returns:
+        dict: Decoded JSON body (the pagination helpers read its "meta" and "results" keys)
+
+    Raises:
+        ValueError: If an argument has the wrong type or is not positive
+        requests.RequestException: If the request fails, times out or returns an HTTP error
+    """
+    if not isinstance(query_str, str):
+        raise ValueError(f'Expecting a str for "query_str", got "{type(query_str)}"')
+    if not isinstance(current_page, int) or current_page<=0:
+        raise ValueError(f'Expected a positive int for "current_page", got "{type(current_page)}"')
+    if not isinstance(works_per_page, int) or works_per_page<=0:
+        raise ValueError(f'Expected a positive int for "works_per_page", got "{type(works_per_page)}"')
+
+    # Encode the user text so "&", "#", spaces, ... cannot corrupt the query string.
+    full_request = f"{OPEN_ALEX_ENDPOINT}{quote_plus(query_str)}&per_page={works_per_page}&page={current_page}"
+
+    response = requests.get(full_request, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.json()
+
+def test():
+    """Manual smoke test: performs a real network request."""
+    query = "multiverse"
+    data = search_open_alex(query)
+    results = get_results_open_alex(data)
+    return pd.DataFrame(results)
+
+
+# if __name__ == "__main__":
+#     test()
diff --git a/www/services/openalex/pagination.py b/www/services/openalex/pagination.py
new file mode 100644
index 000000000..55498d622
--- /dev/null
+++ b/www/services/openalex/pagination.py
@@ -0,0 +1,68 @@
+from .api_service import search_open_alex
+from .parser import transform_from_open_alex
+from .utils import *
+import pandas as pd
+import logging
+
+def get_max_pages(open_alex_json: dict) -> int | str:
+    """Calculate the total number of pages available for a specific topic
+
+    Args:
+        open_alex_json (dict): OpenAlex JSON response whose "meta" entry holds "count" and "per_page"
+
+    Returns:
+        int | str: Number of pages needed to show every work (rounded up).
+            The string "Metadata unavailable" is returned when the metadata is missing or malformed."""
+
+    logging.debug('Invoked get_max_pages\n')
+    logging.debug(f'Arg: open_alex_json. Arg type: {type(open_alex_json)}\n')
+
+    if not isinstance(open_alex_json, dict):
+        logging.error(f'Expected a dict for arg "open_alex_json", got "{type(open_alex_json)}".')
+        return "Metadata unavailable"
+
+    metadata = open_alex_json.get('meta')
+    logging.debug(f'Metadata:\n{metadata}')
+
+    if not isinstance(metadata, dict) or not metadata:
+        logging.error('No metadata available.')
+        return "Metadata unavailable"
+
+    works_count = metadata.get('count')
+    works_per_page = metadata.get('per_page')
+
+    # A count of 0 is a valid empty result set; only a missing count or a
+    # missing/zero page size makes the metadata unusable.
+    if works_count is None or not works_per_page:
+        logging.error(f'Malformed metadata.')
+        return "Metadata unavailable"
+
+    if not isinstance(works_count, int) or not isinstance(works_per_page, int):
+        logging.error(f'Expected int for "works_count" and "works_per_page", got "{type(works_count)}" and "{type(works_per_page)}".')
+        return "Metadata unavailable"
+
+    # Ceiling division: a partially filled last page still counts as a page.
+    return (works_count + works_per_page - 1) // works_per_page
+
+
+def transform_to_df(open_alex_json: dict) -> pd.DataFrame:
+    """Transform an openalex json to a pandas dataframe
+
+    Args:
+        open_alex_json (dict): OpenAlex JSON response whose "results" entry is a list of works
+
+    Returns:
+        pd.DataFrame: OpenAlex query results converted by transform_from_open_alex
+
+    Raises:
+        ValueError: If the argument is not a dict or "results" is not a list
+    """
+    if not isinstance(open_alex_json, dict):
+        raise ValueError(f'Expected a dict for arg "open_alex_json", got "{type(open_alex_json)}".')
+
+    json_results = open_alex_json.get('results')
+
+    if not isinstance(json_results, list):
+        raise ValueError(f'Expected a list for field "results" in JSON. Got "{type(json_results)}"')
+
+    return transform_from_open_alex(pd.DataFrame(json_results))
diff --git a/www/services/openalex/parser.py b/www/services/openalex/parser.py
new file mode 100644
index 000000000..9ff5e413d
--- /dev/null
+++ b/www/services/openalex/parser.py
@@ -0,0 +1,493 @@
+import pandas as pd
+import json
+import numpy as np
+from iso4 import abbreviate
+import nltk
+import logging
+from itertools import chain
+
+
+def _fetch_value(data: pd.Series, keys:list[str], is_return_list=False):
+    '''Retrieve a value for the specified key-path in a pandas' Series object.
+    This is intended to be used on a JSON object converted to DataFrame object.
+
+    Args:
+        data (pd.Series): Pandas' Series object from which to retrieve the information
+        keys (list[str]): Ordered keys from top level to desired level. Eg. ['Top-level', 'First-nested level', 'Second-nested level', ...]
+        is_return_list (Bool): Specify if empty return must be a list or a str
+
+    Returns:
+        Any | [] | "": Nested value associated with the specified key or empty list/str for invalid results'''
+
+    if not isinstance(data, pd.Series):
+        raise ValueError(f'Data is of type {type(data)}. Must be pandas.Series!')
+
+    if not isinstance(keys, list):
+        raise ValueError(f'Keys is of type {type(keys)}. Must be list of strings!')
+
+    for key in keys:
+        if not isinstance(key, str):
+            raise ValueError(f'Key: {key} from keys: {keys} is of type: {type(key)}. Must be string!')
+
+    empty = [] if is_return_list else ""
+    value = data.to_dict()
+
+    for key in keys:
+        if not isinstance(value, dict):
+            logging.warning(
+                f'Expected a dict, got {type(value)} for key "{key}" in keys "{keys}" for data:\n{data}'
+                f'Returning empty {"list" if is_return_list else "string"}.'
+            )
+            return empty
+
+        value = value.get(key)
+
+        if value is None or (not isinstance(value, (list, dict)) and pd.isna(value)):
+            logging.warning(
+                f'Value is missing or NaN for key "{key}" in keys: "{keys}" in data:\n{data}'
+                f'Returning empty {"list" if is_return_list else "string"}.'
+            )
+            return empty
+
+    return value
+
+
+def _format_author_name(display_name: str) -> str:
+    """Convert 'Firstname [Middlename] Lastname' to 'Lastname FI' format.
+
+    Args:
+        display_name (str): Author full-name to convert
+
+    Returns:
+        str: Author name converted to 'Lastname FI' format
+    """
+    if not isinstance(display_name, str):
+        raise ValueError(f'Expected string for display_name "{display_name}". Got {type(display_name)}')
+
+    if not display_name or not display_name.strip():
+        logging.warning(f'Display_name "{display_name}" is empty. Returning "" ')
+        return ""
+
+    parts = display_name.strip().split()
+
+    if len(parts) == 1:
+        return parts[0] # single name, return as-is
+
+    surname = parts[-1]
+    initials = "".join(f"{p[0]}." for p in parts[:-1])
+
+    return f"{surname} {initials}"
+
+
+def _calculate_JI(name_to_abbreviate: str) -> str:
+    """Calculate the abbreviated form of the Journal name following ISO4 standard.
+
+    Args:
+        name_to_abbreviate (str): Journal name to abbreviate
+
+    Returns:
+        str: Abbreviated Journal name"""
+    logging.debug(f'Invoked _calculate_JI\nArg type:{type(name_to_abbreviate)}\nArg:\n{name_to_abbreviate}\n\n')
+
+    if not isinstance(name_to_abbreviate, str):
+        raise ValueError(f'Expected string for name_to_abbreviate "{name_to_abbreviate}". Got {type(name_to_abbreviate)}')
+
+    if not name_to_abbreviate or not name_to_abbreviate.strip():
+        logging.warning(f'name_to_abbreviate "{name_to_abbreviate}" is empty. Returning "" ')
+        return ""
+
+    try:
+        return(abbreviate(name_to_abbreviate))
+
+    except Exception as e:
+        logging.warning(f'WARNING! Got exception \n{e}\nReturning empty value')
+        return ""
+
+
+def _calculate_AU_or_AF(authorship_list: list[dict], fullname=False) -> list[str]:
+    """Retrieve the names of the authors in either short or fullname format
+
+    Args:
+        authorship_list (list[dict]): authorships field in the open_alex json response
+        fullname (bool): Flag for returning fullname or short name
+
+    Returns:
+        list[str]: List containing the names of the authors of the work"""
+    logging.debug(f'Invoked _calculate_AU_or_AF\nArg type:{type(authorship_list)}\nArg:\n{authorship_list}\n\n')
+
+    if not isinstance(authorship_list, list):
+        logging.warning(f'Expectinga list, (got {type(authorship_list)}). \nAuthorships_list: {authorship_list}\nReturning empty list')
+        return []
+
+    if not authorship_list or authorship_list is None:
+        logging.warning('Authorship_list is empty or None. Returning empty list')
+        return []
+
+    authors_list = []
+
+    for authorship in authorship_list:
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expected a dict, got {type(authorship)}')
+            continue
+
+        author = authorship.get('author')
+        if not isinstance(author, dict):
+            logging.warning(f'Expected a dict, got {type(author)}')
+            continue
+
+        author_name = author.get('display_name')
+        if not isinstance(author_name, str) or not author_name.strip():
+            logging.warning(f'Missing display_name in authorship entry: {authorship}')
+            continue
+
+        authors_list.append(_format_author_name(author_name) if not fullname else author_name)
+
+    return authors_list
+
+
+def _calculate_C1(authorships_list: list[dict]) -> list[str]:
+    """
+    Calculate authors affiliations and returns them as a list of unique strings in first-seen order.
+
+    Args:
+        authorships_list (list[dict]): OpenAlex's response Json's authorship field
+
+    Returns:
+        list[str]: A list of the authors affiliations
+    """
+    logging.debug(f'Invoked _calculate_C1\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning empty list\n')
+        return []
+
+    if not authorships_list:
+        logging.warning(f'Empty authorships list. Returning empty list\n')
+        return []
+
+    affiliations_set = {}  # dict keys de-duplicate while keeping first-seen order, unlike a set
+    for authorship in authorships_list:
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expected a dict, got "{type(authorship)}"')
+            continue
+
+        raw_affiliation_strings = authorship.get('raw_affiliation_strings')
+        if not isinstance(raw_affiliation_strings, list):
+            logging.warning(
+                f'Expected a list for raw_affiliation_strings, '
+                f'got {type(raw_affiliation_strings)}: {authorship}\n'
+            )
+            continue
+
+        for affiliation_string in raw_affiliation_strings:
+            if not isinstance(affiliation_string, str):
+                logging.warning(f'Expected a str, got "{type(affiliation_string)}". Skipping...')
+                continue
+            affiliations_set[affiliation_string] = None
+
+    return list(affiliations_set)
+
+
+def _get_first_authorship(authorships_list: list[dict]) -> dict | None:
+    '''
+    Helper function to get the first authorship of an authorship list if present.
+
+    Args:
+        authorships_list (list[dict]): An authorships object from an open_alex.json search results file
+
+    Returns:
+        dict | None: The authorship if present, otherwise None
+    '''
+
+    logging.debug(f'Invoked _get_first_authorship\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning None\n')
+        return None
+
+    if not authorships_list:
+        logging.warning('Empty authorships list. Returning None\n')
+        return None
+
+    first_authorship = None
+    for authorship in authorships_list: #Loop to find first author
+
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expecting a dict, got "{type(authorship)}". Skipping...\n')
+            continue
+
+        author_position = authorship.get('author_position')
+        if not isinstance(author_position, str):
+            logging.warning(f'Expected a str for author_position, got "{type(author_position)}". Skipping... ')
+            continue
+
+        if author_position == 'first':
+            first_authorship = authorship
+            break
+
+    if not first_authorship:
+        logging.warning(f'No first author found in {authorships_list}\nReturning None')
+        return None
+
+    return first_authorship
+
+
+def _calculate_RP(authorships_list: list[dict]) -> str:
+    '''
+    Create the reprint address for the first author.
+    (CORRESPONDING AUTHOR)
+    If author is missing will return empty string. If affiliation is missing, will return
+    just the author.
+
+    Args:
+        authorships_list (list[dict]): OpenAlex's Json response's authorship field
+    Returns:
+        "" | str: A string containing the first author's name and first affiliation address if present
+    '''
+    logging.debug(f'Invoked _calculate_RP\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning empty str\n')
+        return ""
+
+    if not authorships_list:
+        logging.warning(f'Empty authorships list. Returning empty str\n')
+        return ""
+
+    first_authorship = _get_first_authorship(authorships_list)
+
+    if not first_authorship:
+        logging.warning(f'No first authorship found! \n{authorships_list}\nReturning empty str')
+        return ""
+
+    author_name = first_authorship.get('raw_author_name')
+    author_affiliations = first_authorship.get('raw_affiliation_strings')
+
+    if not isinstance(author_name, str):
+        logging.warning(
+            'Expected a str for raw_author_name, '
+            f'got {type(author_name)}: {first_authorship}\n'
+        )
+        return ""
+
+    reprint_address = f'{_format_author_name(author_name)} (CORRESPONDING AUTHOR)'
+
+    if not isinstance(author_affiliations, list) or not author_affiliations:
+        logging.warning(f'Expected a non-empty list, got "{type(author_affiliations)}". Returning incomplete reprint address "{reprint_address}"')
+        return reprint_address
+
+    first_affiliation = author_affiliations[0]
+    if not isinstance(first_affiliation, str):
+        logging.warning(f'Expected a str, got "{type(first_affiliation)}". Returning incomplete reprint address "{reprint_address}"')
+        return reprint_address
+
+    reprint_address += f" {first_affiliation}"
+
+    return reprint_address
+
+
+def _calculate_DE_and_ID(keyword_list: list[dict]) -> list[str]:
+    '''
+    Extract the keywords from Open_alex's json response's keyword field and return them as a list of str
+    Args:
+        keyword_list (list[dict]): OpenAlex's keyword field
+    Returns:
+        list[str]: A list containing the work's keywords
+    '''
+    logging.debug(f'Invoked _calculate_DE_and_ID\nArg type:{type(keyword_list)}\nArg:\n{keyword_list}\n')
+
+    if not isinstance(keyword_list, list):
+        logging.warning(f'Expected a list, got {type(keyword_list)}: {keyword_list}\nReturning empty list\n')
+        return []
+
+    if not keyword_list:
+        logging.warning('Empty keywords list. Returning empty list\n')
+        return []
+
+    result = []
+    for keyword_dict in keyword_list:
+        if not isinstance(keyword_dict, dict):
+            logging.warning(f'Expected a dict, got "{type(keyword_dict)}" Skipping...')
+            continue
+
+        keyword = keyword_dict.get('display_name')
+
+        if not isinstance(keyword, str) or not keyword.strip():
+            logging.warning(f'Expected a non-empty str, got "{type(keyword)}" Skipping...')
+            continue
+
+        result.append(keyword)
+
+
+    return result
+
+
+def _calculate_AB(abstract_inverted_index: dict) -> str:
+    '''
+    This function takes as input open_alex's json response's "abstract_inverted_index" field, which is a dict with words as keys
+    and lists of index positions as values. It returns the reconstructed text; malformed indexes yield an empty string.
+    Args:
+        abstract_inverted_index (dict): A dictionary containing words as keys and their index positions as values
+    Returns:
+        str: A string of the reconstructed abstract
+    '''
+
+    logging.debug(f'Invoked _calculate_AB\nArg type:{type(abstract_inverted_index)}\nArg:\n{abstract_inverted_index}\n')
+
+    if not isinstance(abstract_inverted_index, dict):
+        logging.warning(f'Expected a dict, got {type(abstract_inverted_index)}: {abstract_inverted_index}\nReturning empty str\n')
+        return ""
+
+    if not abstract_inverted_index:
+        logging.warning(f'Dictionary is empty: {abstract_inverted_index}\nReturning empty str\n')
+        return ""
+
+    try:
+        max_index = max(list(chain(*abstract_inverted_index.values())))
+        abstract_template = [""] * (max_index+1)
+
+        for word, index_list in abstract_inverted_index.items():
+            for index in index_list:
+                abstract_template[index] = word
+
+        abstract = " ".join(abstract_template)
+    except (TypeError, ValueError, IndexError) as e:  # non-iterable / non-int / empty / out-of-range positions
+        logging.warning(f'Unexpected error!\n\n {e} \n\nReturning empty str')
+        return ""
+
+    return abstract
+
+
+def _calculate_SR(first_authorship: dict, release_year: int, journal_name: str) -> str:
+    '''
+    Calculate the Short Reference -> "AUTHOR, YEAR, JOURNAL"
+    If the year or the journal is missing it is left out of the result; a missing author yields an empty str
+
+    Args:
+        first_authorship (dict): The first authorship dictionary
+        release_year (int): The release year of the work
+        journal_name (str): The full name of the journal
+
+    Returns:
+        The calculated short reference as a str
+    '''
+    logging.debug(f'Invoked _calculate_SR\nArg type:{type(first_authorship)}\nArg:\n{first_authorship}\n')
+    logging.debug(f'Arg type:{type(release_year)}\nArg:\n{release_year}\nArg type:{type(journal_name)}\nArg:\n{journal_name}\n')
+
+    is_release_year_valid = True
+    is_journal_name_valid = True
+
+
+    # Correct type checks
+    if not isinstance(first_authorship, dict):
+        logging.warning(f'Expected a dict for first_authorship, got a {type(first_authorship)} -> {first_authorship}\nReturning empty str')
+        return ""
+    if not isinstance(release_year, int):
+        logging.warning(f'Expected an int for release_year, got a {type(release_year)} -> {release_year}\nOmitting it in the result.')
+        is_release_year_valid = False
+    if not isinstance(journal_name, str):
+        logging.warning(f'Expected a str for journal_name, got a {type(journal_name)} -> {journal_name}\nOmitting it in the result.')
+        is_journal_name_valid = False
+
+    # Empty args check
+    if not first_authorship:
+        logging.warning('Empty first_authorship value. Returning empty str')
+        return ""
+    if not is_journal_name_valid or not journal_name:
+        logging.warning('Empty journal_name value. Omitting it in the result')
+        is_journal_name_valid = False
+
+    # Fetch first author name
+    author_name = first_authorship.get('raw_author_name')
+    if not isinstance(author_name, str) or not author_name.strip():
+        logging.warning(f'First author name could not be found! {first_authorship}\n Returning empty str')
+        return ""
+    author_name = _format_author_name(author_name)
+
+    short_reference_template = [author_name]
+    if is_release_year_valid:
+        short_reference_template.append(str(release_year))
+    if is_journal_name_valid:
+        short_reference_template.append(journal_name)
+
+    short_reference = ", ".join(short_reference_template)
+
+    return short_reference
+
+
+
+def transform_from_open_alex(input_df: pd.DataFrame) -> pd.DataFrame:
+    """Transformation function to convert an Open_alex JSON to the bibliometrix standard format.
+
+    Args:
+        input_df (pd.DataFrame): Open_alex JSON results converted to Pandas DataFrame
+
+    Returns:
+        pd.DataFrame: Bibliometrix standard format DataFrame (rows that raise are logged and omitted)"""
+
+    logging.debug(f'Invoked "transform_from_open_alex".\n{input_df}\n\n')
+
+    if not isinstance(input_df, pd.DataFrame):
+        raise ValueError(f'Expected a pd.DataFrame, got "{type(input_df)}"!')
+
+    result = []
+
+    for index, row in input_df.iterrows():
+        try:
+            authorships = _fetch_value(row, ['authorships'], is_return_list=True)
+            keywords = _fetch_value(row, ['keywords'], is_return_list=True)
+
+            row_template = {
+                "DB": "open_alex",
+                "UT": _fetch_value(row, ["id"]),
+                "DI": _fetch_value(row, ["doi"]),
+                "PMID": _fetch_value(row, ["ids", "pmid"]),
+                "TI": _fetch_value(row, ["title"]),
+                "SO": _fetch_value(row, ['primary_location', 'source', 'display_name']),
+                "JI": _calculate_JI(_fetch_value(row, ['primary_location', 'source', 'display_name'])),
+                "PY": _fetch_value(row, ['publication_year']),
+                "DT": _fetch_value(row, ['type']),
+                "LA": _fetch_value(row, ['language']),
+                "TC": _fetch_value(row, ['cited_by_count']),
+                "AU": _calculate_AU_or_AF(authorships),
+                "AF": _calculate_AU_or_AF(authorships, fullname=True),
+                "C1": _calculate_C1(authorships),
+                "RP": _calculate_RP(authorships),
+                "CR": _fetch_value(row,[ 'referenced_works'], is_return_list=True),
+                "DE": _calculate_DE_and_ID(keywords),
+                "ID": _calculate_DE_and_ID(keywords),
+                "AB": _calculate_AB(_fetch_value(row, ['abstract_inverted_index'])),
+                "VL": _fetch_value(row, ['biblio', 'volume']),
+                "IS": _fetch_value(row, ['biblio', 'issue']),
+                "BP": _fetch_value(row, ['biblio', 'first_page']),
+                "EP": _fetch_value(row, ['biblio', 'last_page']),
+                "SR": _calculate_SR(
+                    first_authorship=_get_first_authorship(authorships),
+                    release_year=_fetch_value(row, ['publication_year']),
+                    journal_name=_fetch_value(row, ['primary_location', 'source', 'display_name'])
+                )
+            }
+
+            #logging.debug(f'Row "#{index}":\n{row_template}\n\n')
+
+            result.append(row_template)
+
+        except Exception as e:
+            logging.warning(f'Omitting row "#{index}" due to exception:\n{e}\n\n')
+
+    return pd.DataFrame(result)
+
+#nltk.download('wordnet')
+
+# def transform_to_df(data):
+#     return pd.DataFrame(data)
+
+# def _load_json_from_file(filename):
+#     with open(filename, 'r') as f:
+#         data = json.load(f)
+#     return data
+# input_df = transform_to_df(_load_json_from_file('open_alex_motorcycle_results.json'))
+# output_df = transform_from_open_alex(input_df)
+# print(output_df.head(20))
+# with open('open_alex_result.csv', 'w', encoding='utf-8') as f: # Print to file to check
+#     f.write(output_df.to_csv())
\ No newline at end of file
diff --git a/www/services/openalex/utils.py b/www/services/openalex/utils.py
new file mode 100644
index 000000000..9d7a4e443
--- /dev/null
+++ b/www/services/openalex/utils.py
@@ -0,0 +1,11 @@
+def get_count_open_alex(data : dict) -> int:
+    """Return the "count" value stored in the response metadata."""
+    return data["meta"]["count"]
+
+def get_page_open_alex(data : dict) -> int:
+    """Return the "page" value stored in the response metadata."""
+    return data["meta"]["page"]
+
+def get_results_open_alex(data : dict) -> list:
+    """Return the "results" entry of the response (a list of works)."""
+    return data["results"]
diff --git a/www/services/parsers.py b/www/services/parsers.py
index 72b9d370e..c65da921c 100644
--- a/www/services/parsers.py
+++ b/www/services/parsers.py
@@ -1,4 +1,4 @@
-from .utils import *
+from www.services.utils import *
 
 #### WEB OF SCIENCE PARSER ####
 
@@ -38,40 +38,39 @@ def parse_wos_data(datapath): # PARSER FOR WEB OF SCIENCE TXT and
CIW
 
     return elem_data
 
-
 #### PUBMED PARSER ####
 def parse_pubmed_data(datapath): # PARSER FOR PUBMED TXT
-    data = []
-    current_record = {}
-    
+    data = []
     with open(datapath, 'r', encoding='utf-8') as file:
-        lines = file.readlines()
+        file_data = file.read()
+    # Split only at line starts so "PMID - <digits>" inside a field value cannot begin a new record.
+    paper_begin_pattern = r'(?m)^(?=PMID\s*-\s*\d+)'
+    papers = re.split(paper_begin_pattern, file_data)
 
-    for line in lines:
-        # line = line.decode('utf-8') # Decode the line from bytes to string
-        if line.strip() == '':
-            # If the line is empty, add the current record to the data
-            if current_record:
-                data.append(current_record)
-                current_record = {}
-            continue
-        
-        key_match = re.match(r'^([A-Z]+)\s*-\s*(.+)', line)
-        if key_match:
-            key = key_match.group(1)
-            value = key_match.group(2)
-            
-            if key in current_record:
-                current_record[key] += ';' + value
+    for paper in papers:
+        current_record = {}
+        key = None  # no tagged field seen yet in this record
+        for line in paper.split("\n"):
+            if line.strip() == '':
+                # If the line is empty, skip it
+                continue
+
+            key_match = re.match(r'^([A-Z]+)\s*-\s*(.+)', line)
+            if key_match:
+                key = key_match.group(1)
+                value = key_match.group(2)
+
+                if key in current_record:
+                    current_record[key] += ';' + value
+                else:
+                    current_record[key] = value
-            else:
-                current_record[key] = value
-        else:
-            # Add the content to the previous key
-            current_record[key] += ' ' + line.strip()
+            elif key is not None:
+                # Add the content to the previous key
+                current_record[key] += ' ' + line.strip()
 
-    # Add the last record if present
-    if current_record:
-        data.append(current_record)
+        # Add the last record if present
+        if current_record:
+            data.append(current_record)
 
     return data
 
diff --git a/www/services/pubmed_api/api_service.py b/www/services/pubmed_api/api_service.py
new file mode 100644
index 000000000..33f0f4b56
--- /dev/null
+++ b/www/services/pubmed_api/api_service.py
@@ -0,0 +1,51 @@
+import requests
+import pandas as pd
+
+PUBMED_SEARCH_ENDPOINT = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+PUBMED_FETCH_ENDPOINT = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+# Seconds to wait for NCBI before giving up instead of hanging forever.
+REQUEST_TIMEOUT = 30
+
+
+def _get_text(endpoint, params):
+    """Send a GET request to ``endpoint`` and return the response body as text; HTTP errors raise."""
+    response = requests.get(endpoint, params=params, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.text
+
+
+def search_pubmed(query, retmax=20):
+    """Search PubMed for ``query`` with ``usehistory=y`` and return the raw response text."""
+    params = {
+        "db": "pubmed",
+        "retmax": retmax,
+        "term": query,
+        "usehistory": "y"
+    }
+    return _get_text(PUBMED_SEARCH_ENDPOINT, params)
+
+
+def search_webenv(webenvId, query_key, restart=0, retmax=20):
+    """Call the search endpoint for a stored WebEnv/query_key pair, starting at offset ``restart``."""
+    params = {
+        "db": "pubmed",
+        "query_key": query_key,
+        "WebEnv": webenvId,
+        "retstart": restart,
+        "retmax": retmax
+    }
+    return _get_text(PUBMED_SEARCH_ENDPOINT, params)
+
+
+def fetch_pubmed(webEnv, query_key, restart=0, retmax=100):
+    """Fetch up to ``retmax`` stored records, from offset ``restart``, as MEDLINE-formatted text."""
+    params = {
+        "db": "pubmed",
+        "query_key": query_key,
+        "WebEnv": webEnv,
+        "retmode": "text",
+        "rettype": "medline",
+        "retstart": restart,
+        "retmax": retmax
+    }
+    return _get_text(PUBMED_FETCH_ENDPOINT, params)
diff --git a/www/services/pubmed_api/pagination.py b/www/services/pubmed_api/pagination.py
new file mode 100644
index 000000000..37a47f756
--- /dev/null
+++ b/www/services/pubmed_api/pagination.py
@@ -0,0 +1,39 @@
+from .api_service import fetch_pubmed
+import os
+import pandas as pd
+from www.services.format_functions import process_single_file
+
+# Required columns
+DF_COLUMNS = ["DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", "AU", "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR"]
+
+def create_df_from_pubmed_data(papers_dict_list):
+    """Build a DataFrame from parsed papers, keeping only the DF_COLUMNS keys each paper has."""
+    def _remove_uneeded_fields(old_dict):
+        return {k: old_dict[k] for k in DF_COLUMNS if k in old_dict}
+
+    list_of_dicts = [_remove_uneeded_fields(paper) for paper in papers_dict_list]
+    return pd.DataFrame(list_of_dicts)
+
+# Parsing papers data using the library's parser
+def _get_bibliometrix_parsed_data_file(papers_data):
+    """Write the fetched text to pubmed.txt and run that file through process_single_file."""
+    with open("pubmed.txt", "w", encoding='utf-8') as f:
+        f.write(str(papers_data))
+    # Parse only after the ``with`` block has closed (and flushed) the file.
+    return process_single_file(os.path.abspath(f.name), "pubmed", ".txt", "fullname")
+
+def get_paginated_papers_df(webEnv, query_key, page=0, retmax=20):
+    """Fetch one page of PubMed results and return it as a DataFrame.
+
+    ``page`` values above 0 are treated as 1-based; 0 also selects the first page.
+    """
+    if page > 0:
+        page -= 1 # pubmed starts from 0 index
+    restart = page * retmax
+    papers_data = fetch_pubmed(webEnv, query_key, restart=restart, retmax=retmax)
+    papers_dict_list = _get_bibliometrix_parsed_data_file(papers_data)
+    # Debug dump; explicit UTF-8 so non-ASCII metadata cannot fail on platforms
+    # whose default encoding is not UTF-8.
+    with open("biblio_parsed.txt", "w", encoding='utf-8') as ff:
+        ff.write(str(papers_dict_list))
+    return create_df_from_pubmed_data(papers_dict_list)
diff --git a/www/services/pubmed_api/xml_parser.py b/www/services/pubmed_api/xml_parser.py
new file mode 100644
index 000000000..6e08b2c29
--- /dev/null
+++ b/www/services/pubmed_api/xml_parser.py
@@ -0,0 +1,56 @@
+import xml.etree.ElementTree as ET
+
+
+def _find_required(xml_data, tag):
+    """Parse ``xml_data`` and return the first ``tag`` child of the root element.
+
+    Raises:
+        ValueError: If the element is missing
+    """
+    element = ET.fromstring(xml_data).find(tag)
+    if element is None:
+        raise ValueError(f'Element "{tag}" not found in PubMed response')
+    return element
+
+
+def _get_ids_from_xml(xml_data):
+    """Return the text of every <Id> inside <IdList>."""
+    return [id_elem.text for id_elem in _find_required(xml_data, "IdList").findall("Id")]
+
+
+def _get_count(xml_data):
+    """Return the <Count> value as an int."""
+    return int(_find_required(xml_data, "Count").text)
+
+
+def _get_webenv(xml_data):
+    """Return the <WebEnv> value as text."""
+    return _find_required(xml_data, "WebEnv").text
+
+
+def _get_query_key(xml_data):
+    """Return the <QueryKey> value as text."""
+    return _find_required(xml_data, "QueryKey").text
+
+
+def get_data_from_query(xml_data, retmax):
+    """Extract the page count, WebEnv and query key from a search response.
+
+    Args:
+        xml_data: XML response text containing <Count>, <WebEnv> and <QueryKey>
+        retmax: Page size; must be positive
+
+    Returns:
+        tuple: (number_pages, webEnv, query_key)
+
+    Raises:
+        ValueError: If retmax is not positive or a required element is missing
+    """
+    if retmax <= 0:
+        raise ValueError(f'Expected a positive "retmax", got "{retmax}"')
+    count = _get_count(xml_data)
+    number_pages = (count + retmax - 1) // retmax  # ceiling division
+    webEnv = _get_webenv(xml_data)
+    query_key = _get_query_key(xml_data)
+
+    return number_pages, webEnv, query_key