From 547d07cd1919d184c747dc6b2bd0311857e46055 Mon Sep 17 00:00:00 2001
From: nixxdd <stefaniarosci2007@gmail.com>
Date: Wed, 17 Jun 2026 20:23:52 +0200
Subject: [PATCH] Added PubMed API service and OpenAlex API service to the
 project and exposed them in the interface

---
 .gitignore                             |   9 +-
 app.py                                 | 480 ++++++++++++++++++++++--
 functions/__init__.py                  |   4 +-
 functions/get_database.py              |  57 +--
 functions/get_localcitedsources.py     |   7 +-
 functions/get_table.py                 |  20 +-
 functions/load_df_from_parquet.py      |  22 ++
 functions/save_api_results_to_file.py  |  71 ++++
 requirements.txt                       | Bin 4626 -> 4840 bytes
 www/services/format_functions.py       |  18 +-
 www/services/openalex/__init__.py      |   6 +
 www/services/openalex/api_service.py   |  46 +++
 www/services/openalex/pagination.py    |  80 ++++
 www/services/openalex/parser.py        | 493 +++++++++++++++++++++++++
 www/services/openalex/utils.py         |   8 +
 www/services/parsers.py                |  57 ++-
 www/services/pubmed_api/api_service.py |  46 +++
 www/services/pubmed_api/pagination.py  |  30 ++
 www/services/pubmed_api/xml_parser.py  |  30 ++
 19 files changed, 1394 insertions(+), 90 deletions(-)
 create mode 100644 functions/load_df_from_parquet.py
 create mode 100644 functions/save_api_results_to_file.py
 create mode 100644 www/services/openalex/__init__.py
 create mode 100644 www/services/openalex/api_service.py
 create mode 100644 www/services/openalex/pagination.py
 create mode 100644 www/services/openalex/parser.py
 create mode 100644 www/services/openalex/utils.py
 create mode 100644 www/services/pubmed_api/api_service.py
 create mode 100644 www/services/pubmed_api/pagination.py
 create mode 100644 www/services/pubmed_api/xml_parser.py

diff --git a/.gitignore b/.gitignore
index 23b99e089..b7e889daa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,11 @@
+.DS_Store
 __pycache__/
+biblio_parsed.txt
+pubmed.txt
+pubmed_data.csv
+pubmed_data.csv
 bibliovenv/
 Bibenv/
-.idea/
\ No newline at end of file
+.idea/
+.venv
+data/
diff --git a/app.py b/app.py
index f0891f894..350da386c 100644
--- a/app.py
+++ b/app.py
@@ -45,13 +45,13 @@
 # -----
 # Author: PRAISELab Team
 
-
 # Import necessary libraries for better performance - avoid importing everything
 import tempfile
 import os
 import requests
 import functools
 from datetime import datetime
+import traceback
 import pandas as pd
 import io
 from functions import *
@@ -64,6 +64,33 @@
 from shinywidgets import render_widget
 from shiny.express import ui, input, render
 
+## pubmed API imports
+from www.services.pubmed_api.api_service import search_pubmed, fetch_pubmed
+from www.services.pubmed_api.pagination import create_df_from_pubmed_data, get_paginated_papers_df
+from www.services.pubmed_api.xml_parser import get_data_from_query
+
+## openalex API imports
+from www.services.openalex.api_service import search_open_alex
+from www.services.openalex.parser import transform_from_open_alex
+from www.services.openalex.pagination import *
+from www.services.openalex.utils import *
+
+# JS Snippet to fix caching issue auto loading the last value of "select"
+ui.tags.script("""
+document.addEventListener('DOMContentLoaded', function() {
+    var sel = document.getElementById('select');
+    if (sel) {
+        sel.setAttribute('autocomplete', 'off');
+        // Force back to the intended default on every fresh load
+        sel.value = '0';
+        // Notify Shiny's reactive system the value changed (in case it was already '0' silently)
+        var event = new Event('change', { bubbles: true });
+        sel.dispatchEvent(event);
+    }
+});
+""")
+
+
 # Setup the Directory for static assets - optimized for performance
 base_dir = tempfile.gettempdir()  # Use system temp dir instead of creating new temp file
 express.app_opts(static_assets=base_dir, debug=False)
@@ -83,6 +110,11 @@
 # Include custom CSS for the app's appearance.
 ui.include_css("www/static/biblioshiny.css")
 
+
+start_trigger = reactive.Value(0)
+api_source = reactive.Value(None)
+df = reactive.Value(None)
+
 # --- Header ---
 # The header bar contains the logo, app name, and a set of dropdown menus for notifications, help, donations, and credits.
 with ui.tags.div(class_="header-bar"):
@@ -575,7 +607,6 @@ def get_latest_cran_version():
             ui.h3("📊 Data Management", style="color: #5567BB;")
             ui.p("Easily import, load, or export your dataset.")
             # ---------- INITIALIZE VARIABLES ----------
-            df = reactive.Value(None)
             
             # Optimized function to reset analysis results when dataset changes
             def reset_all_analyses():
@@ -641,17 +672,20 @@ def create_loading_modal(analysis_type="analysis"):
             with ui.layout_sidebar(fillable=False, fill=False):
                 # Sidebar for data import options
                 with ui.sidebar(id="sidebar_load_data", position="right" ):
-                    # Section for Import or Load
+                    # Section for Import or Load    
                     ui.h5("Data Import Options", style="color: #5567BB;")
                     ui.input_select(
-                        "select",
-                        "Choose an action:",
-                        {
-                            "": "-",
+                        id= "select",
+                        label="Choose an action:",
+                        choices={
+                            "0": "-",
                             "1A": "Import raw data file(s)",
                             "1B": "Load Bibliometrix file(s)",
-                            "1C": "Use a sample dataset"
+                            "1C": "Use a sample dataset",
+                            "1D": "API Import",
+                            "1E": "Load locally saved API results"
                         },
+                        selected="0"
                     )
 
                     @render.express()
@@ -711,7 +745,19 @@ def select_db():
                             ui.input_action_button("start_button", "Start", icon=ICONS["play"])
                             ui.markdown("Select a predefined sample dataset for testing purposes.")
 
-                        else:
+                        elif input.select() == "1D":
+                            ui.p("Redirecting to the API Import tab.", style="color: gray;")
+
+                        elif input.select() == "1E":
+                            ui.input_file(
+                                    "parquet_dataset",
+                                    "Choose a File",
+                                    accept=[".pqt"]
+                                )
+                            ui.p("Load a saved parquet file from data folder", style="color: gray; font-size: 10px; margin-top: -20px;")
+                            ui.input_action_button("load_parquet_file", "Start", icon=ICONS["play"])
+
+                        else:   
                             ui.p("Please select a valid action to begin managing your data.", style="color: gray;")
                             ui.p("Follow the instructions below to manage your data efficiently:")
                             ui.markdown(
@@ -738,30 +784,92 @@ def select_db():
                         # )
                         # ui.input_action_button("export_button", "Export", icon=ICONS["download"], disabled=True)
 
-                @render.express()
+                    @reactive.effect
+                    @reactive.event(input.select)
+                    def _redirect_to_api():
+                        if input.select() == "1D":
+                            ui.update_navs("hidden_tabs", selected="API")
+
+                    @reactive.effect
+                    @reactive.event(input.load_parquet_file)
+                    def load_local_parquet_file():
+                        # Load the chosen parquet file into the shared dataset ``df`` and select the 'import' tab.
+                        logging.debug('App.py - Invoked "load_local_parquet_file()"')
+                        if input.select() != "1E":
+                            return
+                        uploaded = input.parquet_dataset()
+                        if not uploaded:  # "Start" was clicked before any file was chosen
+                            ui.notification_show("Please choose a parquet file first", type="error")
+                            return
+                        df.set(load_df_from_parquet(filepath=uploaded[0]["datapath"]))
+                        if start_trigger.get() == 0:  # Trigger for opening analysis page
+                            start_trigger.set(1)
+                        ui.update_navs(id='hidden_tabs', selected='import')
+        
+                
+                @reactive.effect
+                @reactive.event(input.select)
+                def _ ():
+                    logging.debug(f'App.py - Select value: "{input.select()}"')
+
+                @reactive.effect
                 @reactive.event(input.start_button)
+                def _trigger_from_file():
+                    logging.debug('App.py - Invoked "_trigger_from_file()"')
+                    start_trigger.set(1)
+                    #ui.update_navs(id='hidden_tabs', selected='import')
+                
+                @render.express()
                 def mostra():
-                    database = get_database(input)
+                    logging.debug('App.py - Invoked mostra() function')
+                    trigger = start_trigger.get()
+                    if trigger == 0:
+                        return
+                    logging.debug(f'Mostra function - trigger value: "{trigger}"')
+
+                    ui.HTML(init_itables())
+
+
+                    source = api_source.get()
+                    if source is not None:
+                        database = "PubMed" if source == "pubmed" else "OpenAlex"
+                        #api_source.set(None)  # reset per la prossima volta
+                    else:
+                        database = get_database(input)
+                    # database = get_database(input)
                     ui.update_sidebar("sidebar_load_data", show=False)
                     ui.update_action_button("export_button", disabled=False)
                     ui.markdown(f"<h3 style='text-align:center; color: #5567BB;'>Data of {database}</h3>")
 
-                    if database == "Sample":
-                        data = df.set(pd.read_excel("sources/samples/sample.xlsx"))
-                        reset_all_analyses()  # Reset analysis results when sample is loaded
-
-                    @render.express()
-                    @reactive.event(input.Dataset)
-                    def show_data():
-                        text = get_data(input, database, df, reset_all_analyses)
-                        text
-                    ui.HTML(init_itables())
-
-                    @render.ui
-                    @reactive.event(input.start_button)
-                    def show_table():
+                    logging.debug('Invoked show_table function...')
+                    table_ui = ui.h5("No data available.")
+                    try:
                         table_ui, _, _ = get_table(database, df)
-                        return table_ui
+                    except Exception as e:
+                        logging.error(e, exc_info=True)
+                    table_ui
+
+                    # if database == "Sample":
+                    #     data = df.set(pd.read_excel("sources/samples/sample.xlsx"))
+                    #     reset_all_analyses()  # Reset analysis results when sample is loaded
+
+                    # @render.express()
+                    # @reactive.event(input.Dataset)
+                    # def show_data():
+                    #     text = get_data(input, database, df, reset_all_analyses)
+                    #     #text
+                    
+
+                    # @render.ui
+                    # @reactive.event(lambda: start_trigger.get())
+                    # def show_table():
+                    #     logging.debug('Invoked show_table function...')
+                    #     table_ui = ui.h5("No data available.")
+                    #     try:
+                    #         table_ui, _, _ = get_table(database, df)
+                    #     except Exception as e:
+                    #         logging.error(e, exc_info=True)
+                    #     return table_ui
 
                     # -------- ADVICE BUTTON --------
                     @render.ui
@@ -853,9 +961,321 @@ def indicator_types_ui_all():
                     """
                 ),
 
-        with ui.nav_panel("None", value="API"):
-            ui.h3("🚧 Warning: API is under construction 🚧")
         
+        ## API SECTION ##
+        PAGINATION_LIMIT = 10000
+
+        current_query = reactive.Value(None)
+        works_per_page = reactive.Value(20)
+        search_results = reactive.Value(None)
+        full_results = reactive.Value(None)
+        max_number_pages = reactive.Value(None)
+        web_env_store = reactive.Value(None)
+        query_key_store = reactive.Value(1)
+        current_page = reactive.Value(0)
+        is_pubmed = reactive.Value(True)
+        page_limit = reactive.Value(None)
+        go_to_page_button_flag = reactive.Value(True)
+
+        with ui.nav_panel("None", value="API"):
+
+            ui.h3("PubMed or OpenAlex API", style="color: #5567BB;")
+            ui.input_select(
+                "api_select",
+                "Choose an API to import data from:",
+                {
+                    "": "-",
+                    "pubmed": "PubMed API",
+                    "openalex": "OpenAlex API"
+                },
+            )
+            
+            @render.express()
+            def render_api_selection():
+                if input.api_select() == "pubmed":
+                    ui.h4("PubMed Search", style="color: #5567BB; margin-top: 20px;")
+                    ui.p("Import data directly from PubMed using their API.", style="color: gray;")
+                    with ui.layout_column_wrap():
+                        with ui.card():
+                            ui.input_text("pubmed_query", "Enter PubMed search query (e.g. 'cancer or genomics...')")
+                        with ui.card():
+                            ui.input_numeric(id="pubmed_works_per_page", label="Select the amounts of works per page: ", min=1, max=100, value=works_per_page.get())
+                            
+                            @render.text
+                            def pubmed_works_per_page_value():
+                                # Validate the type before comparing: a cleared numeric input may yield None, and `None < 1` raises TypeError.
+                                value = input.pubmed_works_per_page()
+                                if not isinstance(value, int) or value < 1 or value > 100:
+                                    return 'Invalid number. Please choose an integer between 1 and 100'
+                                works_per_page.set(value)  # remember the accepted page size for later searches
+                                return f'Selected value: {value}'
+
+                    ui.input_action_button("handle_pubmed_search", "Search PubMed", class_="btn-primary")
+
+                elif input.api_select() == "openalex":
+                    is_pubmed.set(False)
+                    ui.h4("OpenAlex Search", style="color: #5567BB; margin-top: 20px;")
+                    ui.p("Import data directly from OpenAlex using their API.", style="color: gray;")
+                    with ui.layout_column_wrap():
+                        with ui.card():
+                            ui.input_text("openalex_query", "Enter OpenAlex search query (e.g. 'cancer or genomics...')")
+
+                        with ui.card():
+                            ui.input_numeric(id="openalex_works_per_page", label="Select the amounts of works per page: ", min=1, max=100, value=works_per_page.get())
+
+                            @render.text
+                            def openalex_works_per_page_value():
+                                # Validate the type before comparing: a cleared numeric input may yield None, and `None < 1` raises TypeError.
+                                value = input.openalex_works_per_page()
+                                if not isinstance(value, int) or value < 1 or value > 100:
+                                    return 'Invalid number. Please choose an integer between 1 and 100'
+                                works_per_page.set(value)  # remember the accepted page size for later searches
+                                return f'Selected value: {value}'
+
+                    ui.input_action_button("handle_openalex_search", "Search OpenAlex", class_="btn-primary")
+
+                results = search_results()
+                if results is not None:
+                    max_pages = max_number_pages()
+                    page = current_page()
+                    page_limit_value = page_limit.get()
+
+                    ui.h4(f"Search Results for '{current_query.get()}':", style="color: #5567BB; margin-top: 20px;")
+                    ui.h5(f"Total Number of pages: {max_pages} | Works per page: {works_per_page.get()}", style="color: gray; margin-bottom: 20px;")
+                    ui.h5(f"Current page: {page}", style="color: gray; margin-bottom: 20px;")
+                    
+                    with ui.layout_column_wrap():
+                        with ui.card(): # (optional) navigating to the page
+                            ui.input_numeric('page_number', f"Go to page: (Max {page_limit_value})", min=1, max=page_limit_value, value=int(page))
+
+                            @render.text
+                            def page_number_value():
+                                value = input.page_number()
+                                if not isinstance(value, int) or value < 1 or value > page_limit_value:
+                                    go_to_page_button_flag.set(True)
+                                    return f'Invalid number. Please choose an integer between 1 and {page_limit_value}'
+                                
+                                go_to_page_button_flag.set(False)
+                                return f'Selected value: {value}'
+                            
+                            #ui.input_text("page_number", f"Go to page: (Max {page_limit.get()})", value=str(page))
+                            ui.input_action_button("go_to_page", "Go", class_="btn-secondary", disabled=go_to_page_button_flag.get())
+
+                        with ui.card():
+                            ui.input_action_button("prev_page", "← Previous", disabled=(page <= 1))
+                            ui.input_action_button("next_page", "Next →", disabled=(page >= page_limit.get()))
+
+                    @render.data_frame
+                    def results_df():
+                        df = search_results()
+                        if df is None:
+                            return render.DataGrid(
+                                pd.DataFrame())
+
+                        results_show = df.copy()
+
+                        return render.DataGrid(
+                            results_show,
+                            filters=True,
+                            width="100%",
+                            summary=False,
+                            styles=[
+                                {
+                                    "style": {
+                                        "vertical-align": "top",
+                                        "width": "150px",
+                                        "min-width": "150px",
+                                        "max-width": "150px",
+                                        "overflow": "hidden",
+                                        "text-overflow": "ellipsis",
+                                        "white-space": "nowrap",
+                                    }
+                                }
+                            ]
+                        )
+
+                    ui.h4("Start Analysis", style="color: #5567BB; margin-top: 30px;")
+                    ui.input_action_button("api_start_button", "Start Analysis", icon=ICONS["play"])
+                    ui.input_action_button("api_save_results_to_file", "Save current page results", icon=ICONS["save"])
+                    
+            
+            @reactive.effect
+            @reactive.event(input.handle_pubmed_search)
+            def _handle_pubmed_search():
+                is_pubmed.set(True)
+                try:
+                    query = input.pubmed_query()
+                    # count = input.pubmed_count()
+                    
+                    if not query or query.strip() == "":
+                        ui.notification_show("Please enter a search query", type="error")
+                        return
+                    
+                    current_query.set(query)
+
+                    ui.notification_show(f"Searching PubMed for: {query}")
+                    query_result = search_pubmed(query)
+                    retmax = works_per_page.get()
+                    max_pages, webEnv, query_key = get_data_from_query(query_result, retmax)
+                    print(f"Max pages: {max_pages}, WebEnv: {webEnv}")
+                    max_number_pages.set(max_pages)
+                    web_env_store.set(webEnv)
+                    query_key_store.set(query_key)
+                    current_page.set(1)
+
+                    computed_limit = compute_page_limit()
+                    page_limit.set(computed_limit)
+                   
+                    first_page = get_paginated_papers_df(webEnv, query_key, page=0, retmax = retmax)
+                    search_results.set(first_page)
+                    
+
+                except Exception as e:
+                    logging.exception('pubmed search error')
+                    ui.notification_show(f"Error: {str(e)}", type="error")
+
+            def compute_page_limit():
+                # PAGINATION_LIMIT counts works, so turn it into a page count before comparing it with the number of pages.
+                page_cap = PAGINATION_LIMIT // works_per_page.get()
+                # Offer no more pages than the query has, and none beyond the cap.
+                return min(max_number_pages.get(), page_cap)
+            
+            
+            @reactive.effect
+            @reactive.event(input.handle_openalex_search)
+            def _handle_openalex_search():
+                is_pubmed.set(False)
+                try:
+                    query = input.openalex_query().lower().strip()
+                    if not query or query.strip() == "":
+                            ui.notification_show("Please enter a search query", type="error")
+                            return
+
+                    current_query.set(query)
+
+                    ui.notification_show(f"Searching OpenAlex for: {query}")
+
+                    #First search, get metadata to update max_pages and display first page
+                    query_result = search_open_alex(query_str=query, works_per_page=works_per_page.get())
+                    max_number_pages.set(
+                        get_max_pages(query_result)
+                    )
+                    page_limit.set(PAGINATION_LIMIT // works_per_page.get())
+                    current_page.set(1)
+
+                    first_page = transform_to_df(query_result)
+                    search_results.set(first_page)
+
+
+                    # max_pages = get_data_from_query_open_alex(query)
+                    # max_number_pages.set(max_pages)
+
+                    #print("Max pages for OpenAlex query:", max_pages)
+                    #current_page.set(1)
+
+                    # all_df = search_open_alex(query, per_page=None)
+                    # full_results.set(all_df)
+
+                    # first_page = paginated_papers_open_alex(query, page=1)
+                    # search_results.set(first_page)
+                
+                except Exception as e:
+                    ui.notification_show(f"Error: {str(e)}", type="error")
+            
+            @reactive.effect
+            @reactive.event(input.prev_page)
+            def _go_prev():
+                page = current_page.get()
+                if page > 1: 
+                    new = page - 1
+                    current_page.set(new)
+                    if is_pubmed.get():
+                        web_env = web_env_store.get()
+                        query_key = query_key_store.get()
+                        retmax = works_per_page.get()
+                        search_results.set(get_paginated_papers_df(web_env, query_key, page=new, retmax=retmax))
+                    else:
+                        query_results = search_open_alex(query_str=input.openalex_query(), works_per_page=works_per_page.get(), current_page=new)
+                        search_results.set(
+                            transform_to_df(query_results)
+                        )
+
+            @reactive.effect
+            @reactive.event(input.next_page)
+            def _go_next():
+                page = current_page.get()
+                last_page = min(max_number_pages.get(), page_limit.get())  # pages are 1-based; stay within both bounds
+                if page < last_page:
+                    new = page + 1
+                    current_page.set(new)
+                    if is_pubmed.get():
+                        # NOTE(review): the initial search fetches page=0 but this passes the 1-based page number; confirm get_paginated_papers_df's indexing.
+                        web_env = web_env_store.get()
+                        query_key = query_key_store.get()
+                        search_results.set(get_paginated_papers_df(web_env, query_key, page=new, retmax=works_per_page.get()))
+                    else:
+                        query_results = search_open_alex(query_str=input.openalex_query(), works_per_page=works_per_page.get(), current_page=new)
+                        search_results.set(
+                            transform_to_df(query_results)
+                        )
+
+            @reactive.effect
+            @reactive.event(input.go_to_page)
+            def _go_to_page():
+                # Jump to the page typed into "page_number" and load its results.
+                val = input.page_number()
+                # Check type and range here too instead of relying only on the button's disabled state.
+                if isinstance(val, int) and 1 <= val <= min(max_number_pages.get(), page_limit.get()):
+                    current_page.set(val)
+                    if is_pubmed.get():
+                        # NOTE(review): the initial search fetches page=0 but this passes the 1-based page number; confirm get_paginated_papers_df's indexing.
+                        web_env = web_env_store.get()
+                        query_key = query_key_store.get()
+                        retmax = works_per_page.get()
+                        search_results.set(get_paginated_papers_df(web_env, query_key, page=val, retmax=retmax))
+                    else:
+                        query_results = search_open_alex(query_str=input.openalex_query(), works_per_page=works_per_page.get(), current_page=val)
+                        search_results.set(
+                            transform_to_df(query_results)
+                        )
+            
+
+            @reactive.effect
+            @reactive.event(input.api_start_button)
+            def _api_start():
+                logging.debug('App.py - Invoked "_api_start()"')
+                api_source.set(input.api_select())
+                df.set(search_results.get())
+                print(df.get())
+
+                # Trigger for opening analysis page
+                if start_trigger.get() == 0:
+                    start_trigger.set(1)
+                
+                ui.notification_show("Analysis done. See  Main Information tab from the Overview side bar.")
+
+                ui.update_navs(id='hidden_tabs', selected='import')
+
+            @reactive.effect
+            @reactive.event(input.api_save_results_to_file)
+            def api_save_results_to_file():
+                # Save the API results currently on screen to a .pqt file in the local "data" folder.
+                logging.debug('Invoked "api_save_results_to_file()"')
+                # NOTE(review): this also overwrites the shared dataset ``df``; confirm that saving is meant to do so.
+                df.set(search_results.get())
+                logging.debug(f'Results to save:\n{df.get()}\n\n')
+                query_text = current_query.get()
+                if isinstance(query_text, str) and query_text.strip() != "":
+                    # Build the folder path with os.path.join: the former r'.\data' literal is a relative path only on
+                    # Windows; on other systems the backslash is an ordinary character of the name.
+                    # NOTE(review): query_text goes into the file name unescaped; confirm the helper copes with '/' or ':'.
+                    status = save_api_results_to_file(folder_path=os.path.join('.', 'data'), file_name=f'{query_text} search results.pqt', df_to_save=df.get())
+                    logging.debug(f'app.py - status value: "{status}"')
+                    if not status:
+                        ui.notification_show("Something went wrong .", type='error')
+                    else:
+                        ui.notification_show(f'Local file updated: "{query_text} search results.pqt"')
+
         with ui.nav_panel("None", value="collections"):
             ui.h3("🚧 Warning: Merge Collection is under construction 🚧")
 
@@ -8185,8 +8605,10 @@ def update_plot_settings():
 
 # --- Sidebar Management ---
 @render.express()
-@reactive.event(input.start_button)
 def toggle_sidebar():
+    trigger = start_trigger.get()
+    if trigger == 0:
+        return
     with ui.tags.div(id="sidebar_2", class_="custom-sidebar"):
         with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False):
             # Info Section
diff --git a/functions/__init__.py b/functions/__init__.py
index 20e24de36..f0107e8c5 100644
--- a/functions/__init__.py
+++ b/functions/__init__.py
@@ -40,4 +40,6 @@
 from .get_thematicevolution import *
 from .get_cocitation import *
 from .get_collaborationnetwork import *
-from .get_worldmapcollaboration import *
\ No newline at end of file
+from .get_worldmapcollaboration import *
+from .save_api_results_to_file import *
+from .load_df_from_parquet import *
\ No newline at end of file
diff --git a/functions/get_database.py b/functions/get_database.py
index 5c5d4edc5..7140ee96c 100644
--- a/functions/get_database.py
+++ b/functions/get_database.py
@@ -1,3 +1,4 @@
+import logging
 from www.services import *
 
 
@@ -11,27 +12,41 @@ def get_database(input):
     Returns:
         A string representing the name of the database.
     """
-    if input.select() == "1A":  # Bibliographic databases
+    database = ''
+    try:
+        if input.select() == "1A":  # Bibliographic databases
+            
+            database = input.database()
+            
+            if database == "wos":
+                database = "Web of Science"
+            elif database == "scopus":
+                database = "Scopus"
+            elif database == "dimensions":
+                database = "Dimensions"
+            elif database == "lens":
+                database = "Lens.org"
+            elif database == "pubmed":
+                database = "PubMed"
+            elif database == "cochrane":
+                database = "Cochrane Library"
         
-        database = input.database()
+        elif input.select() == "1B":  # Bibliometrix database
+            database = "Bibliometrix"
         
-        if database == "wos":
-            database = "Web of Science"
-        elif database == "scopus":
-            database = "Scopus"
-        elif database == "dimensions":
-            database = "Dimensions"
-        elif database == "lens":
-            database = "Lens.org"
-        elif database == "pubmed":
-            database = "PubMed"
-        elif database == "cochrane":
-            database = "Cochrane Library"
-    
-    elif input.select() == "1B":  # Bibliometrix database
-        database = "Bibliometrix"
-    
-    elif input.select() == "1C":  # Sample database
-        database = "Sample"
-    
+        elif input.select() == "1C":  # Sample database
+            database = "Sample"
+
+        elif input.select() == "1E":
+            database = "Local parquet file"
+
+        elif input.api_select() in ['pubmed', 'openalex']:
+            database_name = input.api_select()
+            if database_name == 'pubmed':
+                database = 'PubMed'
+            if database_name == 'openalex':
+                database = 'OpenAlex'
+    except Exception as e:
+        logging.error(f'Error: \n{e}\n\n')
+
     return database
diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py
index 74b261455..f188724a9 100644
--- a/functions/get_localcitedsources.py
+++ b/functions/get_localcitedsources.py
@@ -1,5 +1,7 @@
 from www.services import *
-
+import numpy as np
+import logging
+import pandas as pd
 
 def get_local_cited_sources(df, num_of_cited_sources):
     """
@@ -99,6 +101,9 @@ def wrap_label(label, width=50):
 
     # Set x-axis ticks to 0, 50, 100, etc.
     max_x = source_counts["N. of Local Citations"].max()
+    logging.debug(f'get_localCitedSources.py - max_x value= "{max_x}", max_x type: "{type(max_x)}"')
+    if pd.isna(max_x):
+        max_x = 10
     tick_step = 50
     x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
     if x_ticks[-1] < max_x:
diff --git a/functions/get_table.py b/functions/get_table.py
index 75b9c91d8..34293996e 100644
--- a/functions/get_table.py
+++ b/functions/get_table.py
@@ -1,5 +1,6 @@
 from www.services import *
 from functions.get_status import *
+import logging
 
 
 # Function to create a Plotly table visualization for metadata completeness
@@ -78,8 +79,11 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
     Returns:
         A DataTable object if data is available, otherwise a message indicating no data.
     """
+    logging.debug('get_table - Invoked "get_table()"')
     # Retrieve the data from the DataFrame
     data = df.get()
+    logging.debug(f'data: \n{data}')
+
 
     table_html = ""
     fig = None
@@ -125,8 +129,17 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
         }
 
         # Count missing values (NaN), empty strings, and empty lists in each column
-        missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
-            data.map(lambda x: x == [])).sum()
+        def is_empty_string(x):
+            # Sized non-string values (list, numpy array, dict) count as missing only when empty
+            if hasattr(x, '__len__') and not isinstance(x, (str, bytes)):
+                return len(x) == 0
+            return x in ["", " "]
+
+        is_missing = data.isna() | data.map(is_empty_string)
+        missing_counts = is_missing.sum()
+
+        # missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
+        #     data.map(lambda x: x == [])).sum()
 
         # Calculate the percentage of missing values for each column
         missing_percentage = (missing_counts / total_rows) * 100
@@ -149,6 +162,9 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
         # Create and return the Plotly table
         fig = create_plotly_table(sorted_columns, dpi)
 
+        logging.debug('get_table - FIG OBJECT')
+        logging.debug(fig)
+
         # HTML table header
         table_header = """
         <table style="width:100%; border-collapse: collapse;">
diff --git a/functions/load_df_from_parquet.py b/functions/load_df_from_parquet.py
new file mode 100644
index 000000000..d32b3f34d
--- /dev/null
+++ b/functions/load_df_from_parquet.py
@@ -0,0 +1,22 @@
+import pandas as pd
+import logging
+
+def _clean_numpy_collections(df: pd.DataFrame) -> pd.DataFrame:
+    """Swap numpy arrays found in object columns for plain lists; ``df`` is edited in place and returned."""
+    import numpy as np
+
+    def _as_list(cell):
+        return list(cell) if isinstance(cell, np.ndarray) else cell
+
+    # Only 'object' columns can carry strings, lists, or arrays
+    for name in df.select_dtypes(include=['object']).columns:
+        # Leave the column untouched unless at least one cell is a numpy array
+        if df[name].apply(lambda cell: isinstance(cell, np.ndarray)).any():
+            df[name] = df[name].apply(_as_list)
+    return df
+
+def load_df_from_parquet(filepath):
+    """Read ``filepath`` as parquet and return it with numpy-array cells turned into lists."""
+    frame = _clean_numpy_collections(pd.read_parquet(filepath))
+    logging.debug(f'load_df_from_parquet - {frame}')
+    return frame
\ No newline at end of file
diff --git a/functions/save_api_results_to_file.py b/functions/save_api_results_to_file.py
new file mode 100644
index 000000000..541eeea4b
--- /dev/null
+++ b/functions/save_api_results_to_file.py
@@ -0,0 +1,71 @@
+from pathlib import Path
+import pandas as pd
+import logging
+
+def save_api_results_to_file(folder_path: str, file_name: str, df_to_save: pd.DataFrame) -> bool:
+    """
+    Merge the results of an API call into a local parquet file, keeping one row per "UT" value.
+
+    Returns:
+        bool: True if ok, False if error
+    """
+    import io
+    def _info_text(frame: pd.DataFrame) -> str:
+        # DataFrame.info() prints to stdout and returns None, so capture its report in a buffer
+        report = io.StringIO()
+        frame.info(verbose=True, buf=report)
+        return report.getvalue()
+
+    if not isinstance(folder_path, str):
+        logging.error(f'Arg "folder_path" is of type "{type(folder_path)}", expected a str')
+        return False
+    if not isinstance(file_name, str):
+        logging.error(f'Arg "file_name" is of type "{type(file_name)}", expected a str')
+        return False
+    if not isinstance(df_to_save, pd.DataFrame):
+        logging.error(f'Arg "df_to_save" is of type "{type(df_to_save)}", expected a pd.DataFrame')
+        return False
+    if "UT" not in df_to_save.columns:  # rows are de-duplicated on "UT" further down
+        logging.error('Arg "df_to_save" has no "UT" column')
+        return False
+
+    logging.debug(_info_text(df_to_save))
+
+    dir_path = Path(folder_path)
+    dir_path.mkdir(parents=True, exist_ok=True)
+    file_path = dir_path / file_name
+
+    if not file_path.exists():
+        logging.warning(f"File not found. Initializing a new dataset for {file_path}...")
+        df_local = df_to_save.iloc[0:0].copy()
+    else:
+        try:
+            df_local = pd.read_parquet(file_path)
+            logging.debug(_info_text(df_local))
+        except Exception:
+            logging.exception(f"Error reading the file: {file_path}")
+            return False
+
+    if set(df_local.columns) != set(df_to_save.columns):
+        logging.error("Structure mismatch! The file structure does not match the DataFrame in memory.")
+        return False
+
+    logging.info("Structures match. Checking for new rows...")
+    combined = pd.concat([df_local, df_to_save], ignore_index=True)
+    # keep="last": a freshly fetched row replaces the stored row that has the same "UT"
+    merged = combined.drop_duplicates(subset=["UT"], keep="last")
+    new_rows_count = len(merged) - len(df_local)
+
+    logging.debug(_info_text(merged))
+    logging.debug(merged)
+    logging.debug(new_rows_count)
+    if new_rows_count <= 0:
+        logging.info("No new rows to add. File is up to date.")
+        return True
+    try:
+        merged.to_parquet(file_path)
+        logging.info(f"Successfully added {new_rows_count} new rows to the file.")
+    except Exception:
+        logging.exception('Something went wrong. Did not update file')
+        return False
+    return True
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d94f94d9fde545db192036336adda97dbb14fb42..94705efb0073c7b53cb4b2be5f26822f46f2cd9b 100644
GIT binary patch
delta 359
zcmbQF@<Me(7$dhCgC2t^5F0TVOipABlr?AIW#D2+X2@a41d`bdwm@hBls5rl!^!6v
z)5VJz5*czCvcM!r)(9j6l+$K<uAazH%uvEm0F*0YC}b#QNComBY7BttV0IZy=4RHG
zPXnqg2C7X3+JmYFq|RWoA+s?fquJzCmgU054EaC@fD{>kq=49HGAnB_E6Cx-n`>EL
zGs&8O-JJ)tGZ$!PCDcTazw{W)Cbx2|W;EVx$jQkFbbb-gv~s8dkhj2QZr10T&cq3G
zq2=U@Jo2m{Nu$YMdDbd|{80)tw*;s)9~d^pVC@!Q(?Qw|C(q_x%xJ#Zo==~V(Rgw`
N|93{S&FKQG838RtL|y;@

delta 187
zcmaE%I!R?i7$dhSgC2tk5F0U=O-^JClr>`DW#D2+X2@a41d`bdwm@hBls5!o^U3EK
z(<etT-I<)ktg?9(vmPU(!Q`7P%O}rb&0z%_vYDIhHIu9%*o-`eQifcH0)|Sk8Ad?;
z1|WMT3v;e!G}^qMlaq1tE3O_UZjeSpu=!?_>v`o_K`M<WFXml4xq@#FquFLretnQV
Pt^(f~O*Y#Ku4V)PCp;>~

diff --git a/www/services/format_functions.py b/www/services/format_functions.py
index 1a8ee7af4..11f8ac762 100644
--- a/www/services/format_functions.py
+++ b/www/services/format_functions.py
@@ -523,7 +523,11 @@ def format_di_column(entry, source, file_type):         # Function for DI Column
             doi = entry.get('DI', [''])[0]
     elif source == 'PubMed':
         if file_type == '.txt':
-            doi = entry.get('LID', '')
+            doi_pattern = r'(.+?)\s*\[doi\]$'
+            doi_raw = entry.get('LID', '')
+            for doi_value in doi_raw.split(";"):
+                if re.match(doi_pattern, doi_value):
+                    return doi_value.replace('[doi]', '').strip()
     elif source == 'Scopus':
         if file_type == '.bib':
             doi = entry.get('doi', '')
@@ -993,6 +997,8 @@ def format_py_column(entry, source, file_type):         # Function for PY Column
         if file_type == '.txt':
             publication_year = entry.get('DP', '')
             publication_year = re.findall(r'\d{4}', publication_year)[0] if publication_year else ''
+            if publication_year != '':
+                publication_year = int(publication_year) if publication_year.isdigit() else 0
     elif source == 'Scopus':
         if file_type == '.bib':
             publication_year = str(entry.get('year', ''))
@@ -1627,11 +1633,11 @@ def process_single_file(data, source, file_type, author):
             if column not in entry_data:  # Avoid overwriting existing keys
                 entry_data[column] = entry.get(column, None)
         
-        # Remove the column based on the value of the 'author' field
-        if author == "surname":
-            entry_data.pop('AF', None)  # Remove 'AF' if it exists
-        elif author == "fullname":
-            entry_data.pop('AU', None)  # Remove 'AU' if it exists
+        # # Remove the column based on the value of the 'author' field
+        # if author == "surname":
+        #     entry_data.pop('AF', None)  # Remove 'AF' if it exists
+        # elif author == "fullname":
+        #     entry_data.pop('AU', None)  # Remove 'AU' if it exists
 
         entries.append(entry_data)
 
diff --git a/www/services/openalex/__init__.py b/www/services/openalex/__init__.py
new file mode 100644
index 000000000..78883939f
--- /dev/null
+++ b/www/services/openalex/__init__.py
@@ -0,0 +1,6 @@
+import nltk
+import logging
+# NOTE(review): runs on package import; parser.py logs through the root logger, not 'parser'
+logging.basicConfig(level=logging.DEBUG)
+logging.getLogger('parser').setLevel(logging.WARNING)
+nltk.download('wordnet')
\ No newline at end of file
diff --git a/www/services/openalex/api_service.py b/www/services/openalex/api_service.py
new file mode 100644
index 000000000..6a2d7ea1e
--- /dev/null
+++ b/www/services/openalex/api_service.py
@@ -0,0 +1,46 @@
+import requests
+from .utils import *
+import pandas as pd
+from .parser import transform_from_open_alex
+
+import os  # the API key comes from the environment so that no credential is stored in the repository
+OPEN_ALEX_KEY = os.environ.get("OPEN_ALEX_KEY", "")
+OPEN_ALEX_ENDPOINT = "https://api.openalex.org/works?" + (f"api_key={OPEN_ALEX_KEY}&" if OPEN_ALEX_KEY else "") + "search="
+
+def search_open_alex(query_str: str, current_page:int =1, works_per_page:int =20) -> dict:
+    """Contact the openalex API to retrieve information about the desired topic in JSON format
+
+    Args:
+        query_str (str): Openalex Query Search Parameter (plain text; it is URL-encoded here)
+        current_page (int): Current page
+        works_per_page (int): Number of works per page
+
+    Returns:
+        dict: Decoded JSON body of the response
+    Raises:
+        ValueError: If an argument has the wrong type or a page argument is not positive
+        requests.RequestException: If the request fails, times out or returns an HTTP error status
+    """
+    from urllib.parse import quote
+    if not isinstance(query_str, str):
+        raise ValueError(f'Expecting a str for "query_str", got "{type(query_str)}"')
+    if not isinstance(current_page, int) or current_page<=0:
+        raise ValueError(f'Expected a positive int for "current_page", got "{type(current_page)}"')
+    if not isinstance(works_per_page, int) or works_per_page<=0:
+        raise ValueError(f'Expected a positive int for "works_per_page", got "{type(works_per_page)}"')
+
+    # Encode the query so that characters such as "&" or "#" cannot alter the request parameters
+    full_request = f"{OPEN_ALEX_ENDPOINT}{quote(query_str, safe='')}&per_page={works_per_page}&page={current_page}"
+    response = requests.get(full_request, timeout=30)  # a stalled server must not block forever
+    response.raise_for_status()
+    return response.json()
+
+def test():
+    """Manual smoke test: run one live search and load its results into a DataFrame."""
+    payload = search_open_alex("multiverse")
+    works = get_results_open_alex(payload)
+    pd.DataFrame(works)
+    
+
+# if __name__ == "__main__":
+#     test()
\ No newline at end of file
diff --git a/www/services/openalex/pagination.py b/www/services/openalex/pagination.py
new file mode 100644
index 000000000..55498d622
--- /dev/null
+++ b/www/services/openalex/pagination.py
@@ -0,0 +1,80 @@
+from .api_service import search_open_alex
+from .parser import transform_from_open_alex
+from .utils import *
+import pandas as pd
+import logging
+
+def get_max_pages(open_alex_json: dict) -> int | str:
+    """Calculate the total number of pages available for a specific topic
+
+    Args:
+        open_alex_json (dict): Openalex json response from which to extract metadata information
+
+    Returns:
+        int | str: Number of pages, a final partial page included. "Metadata unavailable" if metadata is unusable"""
+
+    logging.debug('Invoked get_max_pages\n')
+    logging.debug(f'Arg: open_alex_json. Arg type: {type(open_alex_json)}\n')
+
+    if not isinstance(open_alex_json, dict):
+        logging.error(f'Expected a dict for arg "open_alex_json", got "{type(open_alex_json)}".')
+        return "Metadata unavailable"
+
+    metadata = open_alex_json.get('meta')
+    logging.debug(f'Metadata:\n{metadata}')
+
+    if not isinstance(metadata, dict) or not metadata:
+        logging.error('No metadata available.')
+        return "Metadata unavailable"
+
+    works_count = metadata.get('count')
+    works_per_page = metadata.get('per_page')
+
+    if not works_count or not works_per_page:
+        logging.error(f'Malformed metadata.')
+        return "Metadata unavailable"
+
+    if not isinstance(works_count, int) or not isinstance(works_per_page, int):
+        logging.error(f'Expected int for "works_count" and "works_per_page", got "{type(works_count)}" and "{type(works_per_page)}".')
+        return "Metadata unavailable"
+    # Ceiling division: a final, partially filled page still counts as a page
+    return -(-works_count // works_per_page)
+
+
+def transform_to_df(open_alex_json: dict[dict]) -> pd.DataFrame:
+    """Turn one openalex response into a DataFrame through transform_from_open_alex()
+
+    Args:
+        open_alex_json (dict): Decoded openalex response; its 'results' entry must be a list
+
+    Returns:
+        pd.DataFrame: Result of transform_from_open_alex() applied to the 'results' entries
+    """
+    if not isinstance(open_alex_json, dict):
+        raise ValueError(f'Expected a dict for arg "open_alex_json", got "{type(open_alex_json)}".')
+
+    json_results = open_alex_json.get('results')
+    if not isinstance(json_results, list):
+        raise ValueError(f'Expected a list for field "results" in JSON. Got "{type(json_results)}"')
+
+    raw_frame = pd.DataFrame(json_results)
+    return transform_from_open_alex(raw_frame)
+
+# def get_data_from_query_open_alex(query):
+#     data = search_open_alex(query)
+#     total_results = get_count_open_alex(data)
+
+#     max_pages = (total_results // 20)
+#     return max_pages
+
+# def paginated_papers_open_alex(query, page):
+#     data = search_open_alex(query, page=page)
+#     results_json = get_results_open_alex(data)
+#     results_df = pd.DataFrame(results_json)
+
+#     return transform_from_open_alex(results_df)
+
+
+
+
+    
\ No newline at end of file
diff --git a/www/services/openalex/parser.py b/www/services/openalex/parser.py
new file mode 100644
index 000000000..9ff5e413d
--- /dev/null
+++ b/www/services/openalex/parser.py
@@ -0,0 +1,493 @@
+import pandas as pd
+import json
+import numpy as np
+from iso4 import abbreviate
+import nltk
+import logging
+from itertools import chain
+    
+    
+def _fetch_value(data: pd.Series, keys:list[str], is_return_list=False):
+    '''Walk a key path through a pandas Series whose cells may hold nested dicts.
+    Meant for one row of a DataFrame that was built from JSON records.
+
+    Args:
+        data (pd.Series): Row to read from; it is converted to a dict before the lookup
+        keys (list[str]): Key path from the outermost key to the wanted one. Eg. ['Top-level', 'First-nested level', ...]
+        is_return_list (Bool): Picks the fallback value: [] when True, "" when False
+
+    Returns:
+        Any | [] | "": Value at the end of the path, or the fallback when a step is missing, None or NaN'''
+
+    if not isinstance(data, pd.Series):
+        raise ValueError(f'Data is of type {type(data)}. Must be pandas.Series!')
+    if not isinstance(keys, list):
+        raise ValueError(f'Keys is of type {type(keys)}. Must be list of strings!')
+    for key in keys:
+        if not isinstance(key, str):
+            raise ValueError(f'Key: {key} from keys: {keys} is of type: {type(key)}. Must be string!')
+
+    fallback = [] if is_return_list else ""
+    value = data.to_dict()
+
+    for key in keys:
+        # Every step of the path needs a dict to descend into
+        if not isinstance(value, dict):
+            logging.warning(
+                f'Expected a dict, got {type(value)} for key "{key}" in keys "{keys}" for data:\n{data}'
+                f'Returning empty {"list" if is_return_list else "string"}.'
+            )
+            return fallback
+
+        value = value.get(key)
+        # Lists and dicts are kept as they are; None and scalar NaN count as missing
+        is_container = isinstance(value, (list, dict))
+        if value is None or (not is_container and pd.isna(value)):
+            logging.warning(
+                f'Value is missing or NaN for key "{key}" in keys: "{keys}" in data:\n{data}'
+                f'Returning empty {"list" if is_return_list else "string"}.'
+            )
+            return fallback
+
+    return value
+
+
+def _format_author_name(display_name: str) -> str:
+    """Turn 'Firstname [Middlename] Lastname' into 'Lastname F.[M.]'.
+
+    Args:
+        display_name (str): Full author name, words separated by whitespace
+
+    Returns:
+        str: Last word, a space, then one dotted initial per earlier word ("" for a blank name)
+
+    Raises:
+        ValueError: If display_name is not a str
+    """
+    if not isinstance(display_name, str):
+        raise ValueError(f'Expected string for display_name "{display_name}". Got {type(display_name)}')
+
+    words = display_name.split()
+    if not words:
+        logging.warning(f'Display_name "{display_name}" is empty. Returning "" ')
+        return ""
+
+    *given_names, surname = words
+    if not given_names:
+        return surname  # single name, return as-is
+    initials = "".join(f"{name[0]}." for name in given_names)
+    return f"{surname} {initials}"
+
+
+def _calculate_JI(name_to_abbreviate: str) -> str:
+    """Abbreviate a journal name with iso4's ``abbreviate``.
+
+    Args:
+        name_to_abbreviate (str): Journal name to abbreviate
+
+    Returns:
+        str: The abbreviation, or "" for a blank name or when ``abbreviate`` raises"""
+    logging.debug(f'Invoked _calculate_JI\nArg type:{type(name_to_abbreviate)}\nArg:\n{name_to_abbreviate}\n\n')
+
+    if not isinstance(name_to_abbreviate, str):
+        raise ValueError(f'Expected string for name_to_abbreviate "{name_to_abbreviate}". Got {type(name_to_abbreviate)}')
+
+    if not name_to_abbreviate.strip():
+        logging.warning(f'name_to_abbreviate "{name_to_abbreviate}" is empty. Returning "" ')
+        return ""
+
+    try:
+        abbreviation = abbreviate(name_to_abbreviate)
+    except Exception as e:
+        logging.warning(f'WARNING! Got exception \n{e}\nReturning empty value')
+        return ""
+    return abbreviation
+
+
+def _calculate_AU_or_AF(authorship_list: list[dict], fullname=False) -> list[str]:
+    """Retrieve the names of the authors in either short or fullname format
+
+    Args:
+        authorship_list (list[dict]): authorships field in the open_alex json response
+        fullname (bool): True returns each display_name unchanged, False the _format_author_name() form
+
+    Returns:
+        list[str]: Names of the authors of the work; entries without a usable name are skipped"""
+    logging.debug(f'Invoked _calculate_AU_or_AF\nArg type:{type(authorship_list)}\nArg:\n{authorship_list}\n\n')
+
+    if not isinstance(authorship_list, list):
+        logging.warning(f'Expectinga list, (got {type(authorship_list)}). \nAuthorships_list: {authorship_list}\nReturning empty list')
+        return []
+
+    if not authorship_list:
+        logging.warning('Authorship_list is empty or None. Returning empty list')
+        return []
+
+    authors_list = []
+
+    for authorship in authorship_list:
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expected a dict, got {type(authorship)}')
+            continue
+
+        author = authorship.get('author')
+        if not isinstance(author, dict):
+            logging.warning(f'Expected a dict, got {type(author)}')
+            continue
+        # A name that is not a str is skipped like a missing one, instead of failing on .strip()
+        author_name = author.get('display_name')
+        if not isinstance(author_name, str) or not author_name.strip():
+            logging.warning(f'Missing display_name in authorship entry: {authorship}')
+            continue
+
+        authors_list.append(_format_author_name(author_name) if not fullname else author_name)
+
+    return authors_list
+
+
+def _calculate_C1(authorships_list: list[dict]) -> list[str]:
+    """
+    Collect the distinct author affiliation strings, in order of first appearance.
+
+    Args:
+        authorships_list (list[dict]): OpenAlex's response Json's authorship field
+
+    Returns:
+        list[str]: Each 'raw_affiliation_strings' value once; [] when the input is not a non-empty list
+    """
+    logging.debug(f'Invoked _calculate_C1\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning empty list\n')
+        return []
+
+    if not authorships_list:
+        logging.warning(f'Empty authorships list. Returning empty list\n')
+        return []
+    # dict keys de-duplicate while keeping insertion order, so the result is the same on every run
+    affiliations = {}
+    for authorship in authorships_list:
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expected a dict, got "{type(authorship)}"')
+            continue
+
+        raw_affiliation_strings = authorship.get('raw_affiliation_strings')
+        if not isinstance(raw_affiliation_strings, list):
+            logging.warning(
+                f'Expected a list for raw_affiliation_strings, '
+                f'got {type(raw_affiliation_strings)}: {authorship}\n'
+            )
+            continue
+
+        for affiliation_string in raw_affiliation_strings:
+            if not isinstance(affiliation_string, str):
+                logging.warning(f'Expected a str, got "{type(affiliation_string)}". Skipping...')
+                continue
+            affiliations[affiliation_string] = None
+
+    return list(affiliations)
+    
+
+def _get_first_authorship(authorships_list: list[dict]) -> dict | None:
+    '''
+    Find the authorship entry whose 'author_position' is 'first'.
+    Entries that are not dicts, or whose position is not a str, are skipped with a warning.
+
+    Args:
+        authorships_list (list[dict]): An authorships object from an open_alex.json search results file
+
+    Returns:
+        dict | None: The first matching authorship, otherwise None
+            (also None when the input is not a non-empty list)
+    '''
+
+    logging.debug(f'Invoked _get_first_authorship\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    # Guard clauses: only a non-empty list can be searched
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning None\n')
+        return None
+
+    if not authorships_list:
+        logging.warning('Empty authorships list. Returning None\n')
+        return None
+
+    for authorship in authorships_list:
+        # Anything that is not a dict cannot carry a position
+        if not isinstance(authorship, dict):
+            logging.warning(f'Expecting a dict, got "{type(authorship)}". Skipping...\n')
+            continue
+
+        # The position must be a str before it is compared
+        author_position = authorship.get('author_position')
+        if not isinstance(author_position, str):
+            logging.warning(f'Expected a str for author_position, got "{type(author_position)}". Skipping... ')
+            continue
+
+        if author_position == 'first':
+            return authorship  # stop at the first match
+
+    # Reached only when no entry was marked as 'first'
+    logging.warning(f'No first author found in {authorships_list}\nReturning None')
+    return None
+
+
+def _calculate_RP(authorships_list: list[dict]) -> str:
+    '''
+    Build the reprint address of the first author, shaped as
+    <First Author abbreviated name> (CORRESPONDING AUTHOR) <First affiliation>
+    No usable first author gives an empty string; a missing or unusable affiliation gives
+    the author part alone.
+
+    Args:
+        authorships_list (list[dict]): OpenAlex's Json response's authorship field
+
+    Returns:
+        "" | str: The first author's name followed by the first affiliation string, when present
+    '''
+    logging.debug(f'Invoked _calculate_RP\nArg type:{type(authorships_list)}\nArg:\n{authorships_list}\n')
+
+    if not isinstance(authorships_list, list):
+        logging.warning(f'Expected a list, got {type(authorships_list)}: {authorships_list}\nReturning empty str\n')
+        return ""
+
+    if not authorships_list:
+        logging.warning(f'Empty authorships list. Returning empty str\n')
+        return ""
+
+    first_authorship = _get_first_authorship(authorships_list)
+
+    if not first_authorship:
+        logging.warning(f'No first authorship found! \n{authorships_list}\nReturning empty str')
+        return ""
+
+    author_name = first_authorship.get('raw_author_name')
+    if not isinstance(author_name, str):
+        logging.warning(
+            'Expected a str for raw_author_name, '
+            f'got {type(author_name)}: {first_authorship}\n'
+        )
+        return ""
+
+    # The author part is always present from here on
+    reprint_address = f'{_format_author_name(author_name)} (CORRESPONDING AUTHOR)'
+
+    author_affiliations = first_authorship.get('raw_affiliation_strings')
+    has_affiliations = isinstance(author_affiliations, list) and bool(author_affiliations)
+    if not has_affiliations:
+        logging.warning(f'Expected a non-empty list, got "{type(author_affiliations)}". Returning incomplete reprint address "{reprint_address}"')
+        return reprint_address
+
+    first_affiliation = author_affiliations[0]
+    if not isinstance(first_affiliation, str):
+        logging.warning(f'Expected a str, got "{type(first_affiliation)}". Returning incomplete reprint address "{reprint_address}"')
+        return reprint_address
+
+    return f"{reprint_address} {first_affiliation}"
+
+
+def _calculate_DE_and_ID(keyword_list: list[dict]) -> list[str]:
+    '''
+    Collect the 'display_name' of every entry in Open_alex's json response's keyword field.
+    Entries that are not dicts, or whose name is not a non-blank str, are skipped with a warning.
+
+    Args:
+        keyword_list (list[dict]): OpenAlex's keyword field
+    Returns:
+        list[str]: A list containing the work's keywords
+    '''
+    logging.debug(f'Invoked _calculate_DE_and_ID\nArg type:{type(keyword_list)}\nArg:\n{keyword_list}\n')
+
+    if not isinstance(keyword_list, list):
+        logging.warning(f'Expected a list, got {type(keyword_list)}: {keyword_list}\nReturning empty list\n')
+        return []
+
+    if not keyword_list:
+        logging.warning('Empty keywords list. Returning empty list\n')
+        return []
+
+    keywords = []
+    for keyword_dict in keyword_list:
+        if not isinstance(keyword_dict, dict):
+            logging.warning(f'Expected a dict, got "{type(keyword_dict)}" Skipping...')
+            continue
+
+        keyword = keyword_dict.get('display_name')
+        # Only non-blank strings are kept
+        if isinstance(keyword, str) and keyword.strip():
+            keywords.append(keyword)
+        else:
+            logging.warning(f'Expected a non-empty str, got "{type(keyword)}" Skipping...')
+
+    return keywords
+    
+    
+def _calculate_AB(abstract_inverted_index: dict) -> str:
+    '''
+    Rebuild an abstract from open_alex's "abstract_inverted_index" field, a dict that maps each word
+    to the list of positions at which it occurs. Positions without a word are left as empty strings.
+    Args:
+        abstract_inverted_index (dict): A dictionary containing words as keys and their index position as values
+    Returns:
+        str: The reconstructed abstract, or "" when the input is not a usable non-empty dict
+    '''
+
+    logging.debug(f'Invoked _calculate_AB\nArg type:{type(abstract_inverted_index)}\nArg:\n{abstract_inverted_index}\n')
+
+    if not isinstance(abstract_inverted_index, dict):
+        logging.warning(f'Expected a dict, got {type(abstract_inverted_index)}: {abstract_inverted_index}\nReturning empty str\n')
+        return ""
+
+    if not abstract_inverted_index:
+        logging.warning(f'Dictionary is empty: {abstract_inverted_index}\nReturning empty str\n')
+        return ""
+
+    try:
+        # max() raises ValueError when no position exists; malformed positions raise TypeError
+        max_index = max(chain.from_iterable(abstract_inverted_index.values()))
+        abstract_template = [""] * (max_index+1)
+
+        for word, index_list in abstract_inverted_index.items():
+            for index in index_list:
+                abstract_template[index] = word
+        abstract = " ".join(abstract_template)
+    except (ValueError, TypeError, IndexError) as e:
+        logging.warning(f'Unexpected error!\n\n {e} \n\nReturning empty str')
+        return ""
+
+    return abstract
+    
+
+def _calculate_SR(first_authorship: dict, release_year: int, journal_name: str) -> str:
+    '''
+    Build the Short Reference -> "<First author name>, <release year>, <journal name>"
+    A release year that is not an int, or a journal name that is not a non-empty str, is left out.
+    The author part is mandatory: without it the result is an empty string.
+
+    Args:
+        first_authorship (dict): The first authorship dictionary
+        release_year (int): The release year of the work
+        journal_name (str): The full name of the journal
+
+    Returns:
+        str: The short reference, or "" when the first author cannot be determined
+    '''
+    logging.debug(f'Invoked _calculate_SR\nArg type:{type(first_authorship)}\nArg:\n{first_authorship}\n')
+    logging.debug(f'Arg type:{type(release_year)}\nArg:\n{release_year}\nArg type:{type(journal_name)}\nArg:\n{journal_name}\n')
+
+    # Without a first authorship dict there is nothing to build
+    if not isinstance(first_authorship, dict):
+        logging.warning(f'Expected a dict for first_authorship, got a {type(first_authorship)} -> {first_authorship}\nReturning empty str')
+        return ""
+
+    # Type checks for the optional parts
+    is_release_year_valid = isinstance(release_year, int)
+    if not is_release_year_valid:
+        logging.warning(f'Expected an int for release_year, got a {type(release_year)} -> {release_year}\nOmitting it in the result.')
+
+    is_journal_name_valid = isinstance(journal_name, str)
+    if not is_journal_name_valid:
+        logging.warning(f'Expected a str for journal_name, got a {type(journal_name)} -> {journal_name}\nOmitting it in the result.')
+
+    # Empty args check
+    if not first_authorship:
+        logging.warning('Empty first_authorship value. Returning empty str')
+        return ""
+
+    if not (is_journal_name_valid and journal_name):
+        logging.warning('Empty journal_name value. Omitting it in the result')
+        is_journal_name_valid = False
+
+    # Fetch first author name
+    author_name = first_authorship.get('raw_author_name')
+    if not isinstance(author_name, str) or not author_name.strip():
+        logging.warning(f'First author name could not be found! {first_authorship}\n Returning empty str')
+        return ""
+
+    # Assemble the parts that passed validation
+    short_reference_template = [_format_author_name(author_name)]
+    if is_release_year_valid:
+        short_reference_template.append(str(release_year))
+    if is_journal_name_valid:
+        short_reference_template.append(journal_name)
+
+    short_reference = ", ".join(short_reference_template)
+    return short_reference
+    
+    
+
+def transform_from_open_alex(input_df: pd.DataFrame) -> pd.DataFrame:
+    """Transformation function to convert an Open_alex JSON to the bibliometrix standard format.
+
+    Args:
+        input_df (pd.DataFrame): Open_alex JSON results converted to Pandas DataFrame
+
+    Returns:
+        pd.DataFrame: Bibliometrix standard format DataFrame; rows whose conversion raises are omitted
+
+    Raises:
+        ValueError: If input_df is not a pd.DataFrame"""
+
+    logging.debug(f'Invoked "transform_from_open_alex".\n{input_df}\n\n')
+
+    if not isinstance(input_df, pd.DataFrame):
+        raise ValueError(f'Expected a pd.DataFrame, got "{type(input_df)}"!')
+
+    result = []
+
+    for index, row in input_df.iterrows():
+        try:
+            # Values feeding several columns are looked up once per row (one lookup, one warning)
+            authorships = _fetch_value(row, ['authorships'])
+            keywords = _fetch_value(row, ['keywords'], is_return_list=True)
+            source_name = _fetch_value(row, ['primary_location', 'source', 'display_name'])
+            publication_year = _fetch_value(row, ['publication_year'])
+            keyword_names = _calculate_DE_and_ID(keywords)
+
+            row_template = {
+                "DB": "open_alex",
+                "UT":   _fetch_value(row, ["id"]),
+                "DI":   _fetch_value(row, ["doi"]),
+                "PMID": _fetch_value(row, ["ids", "pmid"]),
+                "TI":   _fetch_value(row, ["title"]),
+                "SO":   source_name,
+                "JI":   _calculate_JI(source_name),
+                "PY":   publication_year,
+                "DT":   _fetch_value(row, ['type']),
+                "LA":   _fetch_value(row, ['language']),
+                "TC":   _fetch_value(row, ['cited_by_count']),
+                "AU":   _calculate_AU_or_AF(authorships),
+                "AF":   _calculate_AU_or_AF(authorships, fullname=True),
+                "C1":   _calculate_C1(authorships),
+                "RP":   _calculate_RP(authorships),
+                "CR":   _fetch_value(row,[ 'referenced_works'], is_return_list=True),
+                "DE":   keyword_names,
+                "ID":   list(keyword_names),  # separate list object, as two independent calls would give
+                "AB":   _calculate_AB(_fetch_value(row, ['abstract_inverted_index'])),
+                "VL":   _fetch_value(row, ['biblio', 'volume']),
+                "IS":   _fetch_value(row, ['biblio', 'issue']),
+                "BP":   _fetch_value(row, ['biblio', 'first_page']),
+                "EP":   _fetch_value(row, ['biblio', 'last_page']),
+                "SR":   _calculate_SR(_get_first_authorship(authorships), publication_year, source_name),
+            }
+
+            result.append(row_template)
+        except Exception as e:
+            logging.warning(f'Omitting row "#{index}" due to exception:\n{e}\n\n')
+
+    return pd.DataFrame(result)
+
+#nltk.download('wordnet')
+
+# def transform_to_df(data):
+#     return pd.DataFrame(data)
+
+# def _load_json_from_file(filename):
+#     with open(filename, 'r') as f:
+#         data = json.load(f)
+#     return data
+# input_df = transform_to_df(_load_json_from_file('open_alex_motorcycle_results.json'))
+# output_df = transform_from_open_alex(input_df)
+# print(output_df.head(20))
+# with open('open_alex_result.csv', 'w', encoding='utf-8') as f: # Print to file to check
+#     f.write(output_df.to_csv())
\ No newline at end of file
diff --git a/www/services/openalex/utils.py b/www/services/openalex/utils.py
new file mode 100644
index 000000000..9d7a4e443
--- /dev/null
+++ b/www/services/openalex/utils.py
@@ -0,0 +1,16 @@
+"""Accessors for fields of a decoded OpenAlex list response."""
+
+
+def get_count_open_alex(data: dict) -> int:
+    """Return ``data["meta"]["count"]``."""
+    return data["meta"]["count"]
+
+
+def get_page_open_alex(data: dict) -> int:
+    """Return ``data["meta"]["page"]`` (a page number per the OpenAlex paging docs)."""
+    return data["meta"]["page"]
+
+
+def get_results_open_alex(data: dict) -> list:
+    """Return ``data["results"]`` (a list of records per the OpenAlex paging docs)."""
+    return data["results"]
\ No newline at end of file
diff --git a/www/services/parsers.py b/www/services/parsers.py
index 72b9d370e..c65da921c 100644
--- a/www/services/parsers.py
+++ b/www/services/parsers.py
@@ -1,4 +1,4 @@
-from .utils import *
+from www.services.utils import *
 
 
 #### WEB OF SCIENCE PARSER ####
@@ -38,40 +38,59 @@ def parse_wos_data(datapath):  # PARSER FOR WEB OF SCIENCE TXT and CIW
 
     return elem_data
 
-
 #### PUBMED PARSER ####
 def parse_pubmed_data(datapath):  # PARSER FOR PUBMED TXT
-    data = []
-    current_record = {}
-    
+    """Parse a PubMed (MEDLINE-format) text export into a list of records.
+
+    The text is split in front of every line that starts with a ``PMID -`` tag,
+    and each chunk becomes one dict mapping a tag to its text. A tag that occurs
+    several times has its values joined with ``;``; a line that does not start
+    with a tag is appended, space-separated, to the most recent tag's value.
+
+    Args:
+        datapath: Path of the UTF-8 text file to read.
+
+    Returns:
+        A list of dicts, one per chunk that contained at least one tagged line.
+    """
+    data = []
     with open(datapath, 'r', encoding='utf-8') as file:
-        lines = file.readlines()
+        file_data = file.read()
+    # Anchor the split to line starts so "PMID - <digits>" inside a field's text
+    # cannot begin a bogus record.
+    paper_begin_pattern = r'(?m)^(?=PMID\s*-\s*\d+)'
+    papers = re.split(paper_begin_pattern, file_data)
     
-    for line in lines:
-        # line = line.decode('utf-8')  # Decode the line from bytes to string
-        if line.strip() == '':
-            # If the line is empty, add the current record to the data
-            if current_record:
-                data.append(current_record)
-                current_record = {}
-            continue
-
-        key_match = re.match(r'^([A-Z]+)\s*-\s*(.+)', line)
-        if key_match:
-            key = key_match.group(1)
-            value = key_match.group(2)
-
-            if key in current_record:
-                current_record[key] += ';' + value
+    for paper in papers:
+        current_record = {}
+        # Tag that continuation lines extend; reset for every record so a stray
+        # leading line can neither hit an unbound name nor reuse the previous record's tag.
+        key = None
+        lines = paper.split("\n")
+        for line in lines:
+            # line = line.decode('utf-8')  # Decode the line from bytes to string
+            if line.strip() == '':
+                # If the line is empty, skip it
+                continue
+
+            key_match = re.match(r'^([A-Z]+)\s*-\s*(.+)', line)
+            if key_match:
+                key = key_match.group(1)
+                value = key_match.group(2)
+
+                if key in current_record:
+                    current_record[key] += ';' + value
+                else:
+                    current_record[key] = value
             else:
-                current_record[key] = value
-        else:
-            # Add the content to the previous key
-            current_record[key] += ' ' + line.strip()
+                # Add the content to the previous key; text that precedes the
+                # first tag of a record has no key and is dropped.
+                if key is not None:
+                    current_record[key] += ' ' + line.strip()
 
-    # Add the last record if present
-    if current_record:
-        data.append(current_record)
+        # Store the record unless the chunk held no tagged lines
+        if current_record:
+            data.append(current_record)
 
     return data
 
diff --git a/www/services/pubmed_api/api_service.py b/www/services/pubmed_api/api_service.py
new file mode 100644
index 000000000..33f0f4b56
--- /dev/null
+++ b/www/services/pubmed_api/api_service.py
@@ -0,0 +1,80 @@
+"""HTTP calls to the NCBI E-utilities ESearch and EFetch endpoints for PubMed."""
+import requests
+import pandas as pd
+
+PUBMED_SEARCH_ENDPOINT = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+PUBMED_FETCH_ENDPOINT = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+# (connect, read) timeouts in seconds. `requests` applies no timeout unless one
+# is passed, so a stalled connection would otherwise block the caller forever.
+REQUEST_TIMEOUT = (10, 60)
+
+
+def _get_text(endpoint, params):
+    """GET *endpoint* with *params* and return the response body as text.
+
+    Raises:
+        requests.HTTPError: If the response status is 4xx/5xx.
+        requests.Timeout: If the server does not answer within REQUEST_TIMEOUT.
+    """
+    response = requests.get(endpoint, params=params, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.text
+
+
+def search_pubmed(query, retmax=20):
+    """Run an ESearch for *query* with ``usehistory=y`` and return the raw response text.
+
+    Args:
+        query: Value sent as the ``term`` parameter.
+        retmax: Value sent as the ``retmax`` parameter.
+    """
+    params = {
+        "db": "pubmed",
+        "retmax": retmax,
+        "term": query,
+        "usehistory": "y",
+    }
+    return _get_text(PUBMED_SEARCH_ENDPOINT, params)
+
+
+def search_webenv(webenvId, query_key, restart=0, retmax=20):
+    """Call ESearch with a stored ``WebEnv``/``query_key`` pair and return the raw text.
+
+    Args:
+        webenvId: Value sent as ``WebEnv``.
+        query_key: Value sent as ``query_key``.
+        restart: Value sent as ``retstart``.
+        retmax: Value sent as ``retmax``.
+    """
+    # NOTE(review): no ``term`` is sent here, although the ESearch documentation
+    # lists it as required -- confirm this request returns what callers expect.
+    params = {
+        "db": "pubmed",
+        "query_key": query_key,
+        "WebEnv": webenvId,
+        "retstart": restart,
+        "retmax": retmax,
+    }
+    return _get_text(PUBMED_SEARCH_ENDPOINT, params)
+
+
+def fetch_pubmed(webEnv, query_key, restart=0, retmax=100):
+    """Call EFetch for a stored result set and return the response as MEDLINE text.
+
+    Args:
+        webEnv: Value sent as ``WebEnv``.
+        query_key: Value sent as ``query_key``.
+        restart: Value sent as ``retstart``.
+        retmax: Value sent as ``retmax``.
+    """
+    params = {
+        "db": "pubmed",
+        "query_key": query_key,
+        "WebEnv": webEnv,
+        "retmode": "text",
+        "rettype": "medline",
+        "retstart": restart,
+        "retmax": retmax,
+    }
+    return _get_text(PUBMED_FETCH_ENDPOINT, params)
\ No newline at end of file
diff --git a/www/services/pubmed_api/pagination.py b/www/services/pubmed_api/pagination.py
new file mode 100644
index 000000000..37a47f756
--- /dev/null
+++ b/www/services/pubmed_api/pagination.py
@@ -0,0 +1,47 @@
+from .api_service import fetch_pubmed
+import os
+import pandas as pd
+from www.services.format_functions import process_single_file
+
+# Required columns
+DF_COLUMNS = ["DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", "AU", "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR"]
+
+
+def create_df_from_pubmed_data(papers_dict_list):
+    """Return a DataFrame of the records, restricted to the keys listed in ``DF_COLUMNS``."""
+    def _remove_uneeded_fields(old_dict):
+        return {k: old_dict[k] for k in DF_COLUMNS if k in old_dict}
+
+    list_of_dicts = [_remove_uneeded_fields(paper) for paper in papers_dict_list]
+    return pd.DataFrame(list_of_dicts)
+
+
+# Parsing papers data using the library's parser
+def _get_bibliometrix_parsed_data_file(papers_data):
+    """Write *papers_data* to ``pubmed.txt`` and return ``process_single_file``'s result for it."""
+    with open("pubmed.txt", "w", encoding='utf-8') as f:
+        f.write(str(papers_data))
+    # Parse only once the ``with`` block has closed the file: before that, buffered
+    # text may not be on disk yet and the parser would see a truncated or empty file.
+    return process_single_file(os.path.abspath(f.name), "pubmed", ".txt", "fullname")
+
+
+def get_paginated_papers_df(webEnv, query_key, page=0, retmax=20):
+    """Fetch one page of a stored PubMed result set and return it as a DataFrame.
+
+    Args:
+        webEnv: Passed through to ``fetch_pubmed``.
+        query_key: Passed through to ``fetch_pubmed``.
+        page: Page number; 0 and 1 both select the first page.
+        retmax: Page size, also used to compute the record offset.
+    """
+    if page > 0:
+        page -= 1 # pubmed starts from 0 index
+    restart = page * retmax
+    papers_data = fetch_pubmed(webEnv, query_key, restart=restart, retmax=retmax)
+    papers_dict_list = _get_bibliometrix_parsed_data_file(papers_data)
+    # Dump of the parsed records; UTF-8 is set explicitly because the platform's
+    # default encoding may be unable to encode some characters.
+    with open("biblio_parsed.txt", "w", encoding='utf-8') as dump_file:
+        dump_file.write(str(papers_dict_list))
+    return create_df_from_pubmed_data(papers_dict_list)
\ No newline at end of file
diff --git a/www/services/pubmed_api/xml_parser.py b/www/services/pubmed_api/xml_parser.py
new file mode 100644
index 000000000..6e08b2c29
--- /dev/null
+++ b/www/services/pubmed_api/xml_parser.py
@@ -0,0 +1,53 @@
+"""Readers for the fields of an ESearch XML response."""
+import xml.etree.ElementTree as ET
+
+
+def _required_child(root, tag):
+    """Return the direct child *tag* of *root*, or raise ValueError if it is absent."""
+    element = root.find(tag)
+    if element is None:
+        raise ValueError(f"PubMed response has no <{tag}> element")
+    return element
+
+
+def _get_ids_from_xml(xml_data):
+    """Return the text of each ``<Id>`` inside ``<IdList>``."""
+    id_list = _required_child(ET.fromstring(xml_data), "IdList")
+    return [id_elem.text for id_elem in id_list.findall("Id")]
+
+
+def _get_count(xml_data):
+    """Return the ``<Count>`` value as an int."""
+    return int(_required_child(ET.fromstring(xml_data), "Count").text)
+
+
+def _get_webenv(xml_data):
+    """Return the text of ``<WebEnv>``."""
+    return _required_child(ET.fromstring(xml_data), "WebEnv").text
+
+
+def _get_query_key(xml_data):
+    """Return the text of ``<QueryKey>``."""
+    return _required_child(ET.fromstring(xml_data), "QueryKey").text
+
+
+def get_data_from_query(xml_data, retmax):
+    """Return ``(number_pages, webEnv, query_key)`` for an ESearch XML response.
+
+    Args:
+        xml_data: XML text containing ``Count``, ``WebEnv`` and ``QueryKey``
+            as direct children of the root element.
+        retmax: Page size used to turn the record count into a page count.
+
+    Raises:
+        ValueError: If *retmax* is not positive or a required element is missing.
+    """
+    if retmax <= 0:
+        raise ValueError("retmax must be a positive integer")
+    root = ET.fromstring(xml_data)  # parse once for all three fields
+    count = int(_required_child(root, "Count").text)
+    number_pages = -(-count // retmax)  # ceiling division
+    webEnv = _required_child(root, "WebEnv").text
+    query_key = _required_child(root, "QueryKey").text
+
+    return number_pages, webEnv, query_key