PRAISELab-PicusLab · nixxdd · Jun 17, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,10 @@
+.DS_Store
 __pycache__/
+biblio_parsed.txt
+pubmed.txt
+pubmed_data.csv
 bibliovenv/
 Bibenv/
-.idea/
+.idea/
+.venv
+data/
diff --git a/app.py b/app.py
diff --git a/functions/__init__.py b/functions/__init__.py
@@ -40,4 +40,6 @@
 from .get_thematicevolution import *
 from .get_cocitation import *
 from .get_collaborationnetwork import *
-from .get_worldmapcollaboration import *
+from .get_worldmapcollaboration import *
+from .save_api_results_to_file import *
+from .load_df_from_parquet import *
diff --git a/functions/get_database.py b/functions/get_database.py
@@ -1,3 +1,4 @@
+import logging
 from www.services import *
 
 
@@ -11,27 +12,41 @@ def get_database(input):
     Returns:
         A string representing the name of the database.
     """
-    if input.select() == "1A":  # Bibliographic databases
+    database = ''
+    try:
+        if input.select() == "1A":  # Bibliographic databases
+
+            database = input.database()
+
+            if database == "wos":
+                database = "Web of Science"
+            elif database == "scopus":
+                database = "Scopus"
+            elif database == "dimensions":
+                database = "Dimensions"
+            elif database == "lens":
+                database = "Lens.org"
+            elif database == "pubmed":
+                database = "PubMed"
+            elif database == "cochrane":
+                database = "Cochrane Library"
 
-        database = input.database()
+        elif input.select() == "1B":  # Bibliometrix database
+            database = "Bibliometrix"
 
-        if database == "wos":
-            database = "Web of Science"
-        elif database == "scopus":
-            database = "Scopus"
-        elif database == "dimensions":
-            database = "Dimensions"
-        elif database == "lens":
-            database = "Lens.org"
-        elif database == "pubmed":
-            database = "PubMed"
-        elif database == "cochrane":
-            database = "Cochrane Library"
-
-    elif input.select() == "1B":  # Bibliometrix database
-        database = "Bibliometrix"
-
-    elif input.select() == "1C":  # Sample database
-        database = "Sample"
-
+        elif input.select() == "1C":  # Sample database
+            database = "Sample"
+
+        elif input.select() == "1E":
+            database = "Local parquet file"
+
+        elif input.api_select() in ['pubmed', 'openalex']:
+            database_name = input.api_select()
+            if database_name == 'pubmed':
+                database = 'PubMed'
+            if database_name == 'openalex':
+                database = 'OpenAlex'
+    except Exception as e:
+        logging.error(f'Error: \n{e}\n\n')
+
     return database
diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py
@@ -1,5 +1,7 @@
 from www.services import *
-
+import numpy as np
+import logging
+import pandas as pd
 
 def get_local_cited_sources(df, num_of_cited_sources):
     """
@@ -99,6 +101,9 @@ def wrap_label(label, width=50):
 
     # Set x-axis ticks to 0, 50, 100, etc.
     max_x = source_counts["N. of Local Citations"].max()
+    logging.debug(f'get_localCitedSources.py - max_x value= "{max_x}", max_x type: "{type(max_x)}"')
+    if pd.isna(max_x):
+        max_x = 10
     tick_step = 50
     x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
     if x_ticks[-1] < max_x:

diff --git a/functions/get_table.py b/functions/get_table.py
@@ -1,5 +1,6 @@
 from www.services import *
 from functions.get_status import *
+import logging
 
 
 # Function to create a Plotly table visualization for metadata completeness
@@ -78,8 +79,11 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
     Returns:
         A DataTable object if data is available, otherwise a message indicating no data.
     """
+    logging.debug('get_table - Invoked "get_table()"')
     # Retrieve the data from the DataFrame
     data = df.get()
+    logging.debug(f'data: \n{data}')
+
 
     table_html = ""
     fig = None
@@ -125,8 +129,17 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
         }
 
         # Count missing values (NaN), empty strings, and empty lists in each column
-        missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
-            data.map(lambda x: x == [])).sum()
+        def is_empty_string(x):
+            """Return True only when x is a str equal to "" or " "."""
+            # Containers (list, ndarray, dict), numbers and NA-like scalars are never
+            # compared against a string: `pd.NA in ["", " "]` raises TypeError.
+            return isinstance(x, str) and x in ("", " ")
+
+        is_missing = data.isna() | data.map(is_empty_string)
+        missing_counts = is_missing.sum()
+
+        # missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
+        #     data.map(lambda x: x == [])).sum()
 
         # Calculate the percentage of missing values for each column
         missing_percentage = (missing_counts / total_rows) * 100
@@ -149,6 +162,9 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
         # Create and return the Plotly table
         fig = create_plotly_table(sorted_columns, dpi)
 
+        logging.debug('get_table - FIG OBJECT')
+        logging.debug(fig)
+
         # HTML table header
         table_header = """
         <table style="width:100%; border-collapse: collapse;">

diff --git a/functions/load_df_from_parquet.py b/functions/load_df_from_parquet.py
@@ -0,0 +1,46 @@
+import logging
+
+import numpy as np
+import pandas as pd
+
+
+def _clean_numpy_collections(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert numpy arrays embedded in object columns to native Python lists.
+
+    Args:
+        df: DataFrame to clean; its object columns are reassigned in place.
+
+    Returns:
+        The same DataFrame object that was passed in.
+    """
+    # Only 'object' columns can hold embedded arrays (next to strings or lists).
+    for col in df.select_dtypes(include=['object']).columns:
+        # any() over a generator stops at the first array instead of scanning
+        # the whole column; columns without arrays are left untouched.
+        if any(isinstance(cell, np.ndarray) for cell in df[col]):
+            # ndarray.tolist() also turns numpy scalars (np.int64, ...) into
+            # native Python objects, which list(cell) would not.
+            df[col] = df[col].apply(
+                lambda cell: cell.tolist() if isinstance(cell, np.ndarray) else cell
+            )
+    return df
+
+
+def load_df_from_parquet(filepath):
+    """Read a parquet file into a DataFrame whose array cells are Python lists.
+
+    Args:
+        filepath: Location of the parquet file, passed straight to
+            ``pd.read_parquet``.
+
+    Returns:
+        The loaded DataFrame after ``_clean_numpy_collections`` has run on it.
+
+    Raises:
+        Any exception raised by ``pd.read_parquet``; nothing is caught here.
+    """
+    df_local = _clean_numpy_collections(pd.read_parquet(filepath))
+    # %-style args defer rendering the (possibly large) frame until a handler
+    # actually emits the DEBUG record.
+    logging.debug('load_df_from_parquet - %s', df_local)
+    return df_local
diff --git a/functions/save_api_results_to_file.py b/functions/save_api_results_to_file.py
@@ -0,0 +1,104 @@
+import io
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+
+def _log_df_info(df: pd.DataFrame) -> None:
+    """Log the ``df.info()`` summary at DEBUG level.
+
+    ``DataFrame.info`` writes to stdout and returns ``None``, so the summary is
+    captured in a buffer; nothing is computed when DEBUG is disabled.
+    """
+    if logging.getLogger().isEnabledFor(logging.DEBUG):
+        buffer = io.StringIO()
+        df.info(verbose=True, buf=buffer)
+        logging.debug(buffer.getvalue())
+
+
+def save_api_results_to_file(folder_path: str, file_name: str, df_to_save: pd.DataFrame) -> bool:
+    """Merge API results into a parquet file on local storage.
+
+    The file ``folder_path/file_name`` is started from an empty frame when it is
+    missing. Incoming rows are appended to the stored ones, rows sharing a "UT"
+    value are collapsed (the incoming row wins) and the file is (re)written only
+    when the merge yields more rows than were already stored.
+
+    Args:
+        folder_path: Directory holding the file; created if it does not exist.
+        file_name: Name of the parquet file inside ``folder_path``.
+        df_to_save: Rows to merge in; needs a "UT" column and the same set of
+            columns as the stored file.
+
+    Returns:
+        bool: True if ok (also when there is nothing new to add), False if error
+    """
+    expected_types = (
+        ("folder_path", folder_path, str, "str"),
+        ("file_name", file_name, str, "str"),
+        ("df_to_save", df_to_save, pd.DataFrame, "pd.DataFrame"),
+    )
+    for arg_name, value, expected, label in expected_types:
+        if not isinstance(value, expected):
+            logging.error(f'Arg "{arg_name}" is of type "{type(value)}", expected a {label}')
+            return False
+
+    _log_df_info(df_to_save)
+
+    dir_path = Path(folder_path)
+    try:
+        dir_path.mkdir(parents=True, exist_ok=True)
+    except OSError:
+        logging.exception(f"Error creating the folder: {dir_path}")
+        return False
+
+    file_path = dir_path / file_name
+
+    if file_path.exists():
+        try:
+            df_local = pd.read_parquet(file_path)
+        except Exception:
+            logging.exception(f"Error reading the file: {file_path}")
+            return False
+        _log_df_info(df_local)
+    else:
+        logging.warning(f"File not found. Initializing a new dataset for {file_path}...")
+        # Empty frame with the incoming schema, so the merge below is uniform.
+        df_local = df_to_save.iloc[0:0].copy()
+
+    if set(df_local.columns) != set(df_to_save.columns):
+        logging.error(
+            "Structure mismatch! The file structure does not match the DataFrame in memory."
+        )
+        return False
+
+    # "UT" is the de-duplication key; without it drop_duplicates raises KeyError,
+    # which would break the "return False on error" contract.
+    if "UT" not in df_to_save.columns:
+        logging.error('Column "UT" not found. Cannot check for duplicate rows')
+        return False
+
+    logging.info("Structures match. Checking for new rows...")
+
+    combined = pd.concat([df_local, df_to_save], ignore_index=True)
+    # keep="last": for a repeated "UT" the incoming row replaces the stored one.
+    merged = combined.drop_duplicates(subset=["UT"], keep="last")
+    new_rows_count = len(merged) - len(df_local)
+
+    _log_df_info(merged)
+    logging.debug(merged)
+    logging.debug(new_rows_count)
+
+    if new_rows_count <= 0:
+        logging.info("No new rows to add. File is up to date.")
+        return True
+
+    try:
+        merged.to_parquet(file_path)
+    except Exception:
+        logging.exception('Something went wrong. Did not update file')
+        return False
+
+    logging.info(f"Successfully added {new_rows_count} new rows to the file.")
+    return True
diff --git a/requirements.txt b/requirements.txt
diff --git a/www/services/format_functions.py b/www/services/format_functions.py
@@ -523,7 +523,11 @@ def format_di_column(entry, source, file_type):         # Function for DI Column
             doi = entry.get('DI', [''])[0]
     elif source == 'PubMed':
         if file_type == '.txt':
-            doi = entry.get('LID', '')
+            doi_pattern = r'(.+?)\s*\[doi\]$'
+            doi_raw = entry.get('LID', '')
+            for doi_value in doi_raw.split(";"):
+                if re.match(doi_pattern, doi_value):
+                    return doi_value.replace('[doi]', '').strip()
     elif source == 'Scopus':
         if file_type == '.bib':
             doi = entry.get('doi', '')
@@ -993,6 +997,8 @@ def format_py_column(entry, source, file_type):         # Function for PY Column
         if file_type == '.txt':
             publication_year = entry.get('DP', '')
             publication_year = re.findall(r'\d{4}', publication_year)[0] if publication_year else ''
+            if publication_year != '':
+                publication_year = int(publication_year) if publication_year.isdigit() else 0
     elif source == 'Scopus':
         if file_type == '.bib':
             publication_year = str(entry.get('year', ''))
@@ -1627,11 +1633,11 @@ def process_single_file(data, source, file_type, author):
             if column not in entry_data:  # Avoid overwriting existing keys
                 entry_data[column] = entry.get(column, None)
 
-        # Remove the column based on the value of the 'author' field
-        if author == "surname":
-            entry_data.pop('AF', None)  # Remove 'AF' if it exists
-        elif author == "fullname":
-            entry_data.pop('AU', None)  # Remove 'AU' if it exists
+        # # Remove the column based on the value of the 'author' field
+        # if author == "surname":
+        #     entry_data.pop('AF', None)  # Remove 'AF' if it exists
+        # elif author == "fullname":
+        #     entry_data.pop('AU', None)  # Remove 'AU' if it exists
 
         entries.append(entry_data)
 

diff --git a/www/services/openalex/__init__.py b/www/services/openalex/__init__.py
@@ -0,0 +1,6 @@
+import nltk
+import logging
+
+logging.basicConfig(level=logging.DEBUG)  # NOTE(review): configures the root logger at DEBUG as a side effect of importing this package
+logging.getLogger('parser').setLevel(logging.WARNING)  # the logger named 'parser' only emits WARNING and above
+nltk.download('wordnet')  # NOTE(review): executed on every import of this package; presumably needs network access the first time - confirm
diff --git a/www/services/openalex/api_service.py b/www/services/openalex/api_service.py
@@ -0,0 +1,65 @@
+import os
+from urllib.parse import quote_plus
+
+import requests
+from .utils import *
+import pandas as pd
+from .parser import transform_from_open_alex
+
+# NOTE(review): this key is committed in plain text, so it should be treated as
+# leaked: rotate it and provide the new one through the OPEN_ALEX_KEY environment
+# variable. The literal only remains as a fallback so existing setups keep working.
+OPEN_ALEX_KEY = os.environ.get("OPEN_ALEX_KEY", "wi0R0MWb5Dy1mtZv0OMMn5")
+OPEN_ALEX_ENDPOINT = f"https://api.openalex.org/works?api_key={OPEN_ALEX_KEY}&search="
+#OPEN_ALEX_ENDPOINT = f"https://api.openalex.org/works?search="
+
+
+def search_open_alex(query_str: str, current_page: int = 1, works_per_page: int = 20,
+                     timeout: float = 30.0) -> dict:
+    """Query the OpenAlex works endpoint and return the decoded JSON response.
+
+    Args:
+        query_str (str): Openalex Query Search Parameter (raw text; it is
+            URL-encoded here)
+        current_page (int): Current page, must be >= 1
+        works_per_page (int): Number of works per page, must be >= 1
+        timeout (float): Seconds to wait for the server before giving up
+
+    Returns:
+        dict: The JSON body of the response, as decoded by ``response.json()``
+
+    Raises:
+        ValueError: If an argument has the wrong type or is not positive
+        requests.RequestException: On connection errors, timeouts or HTTP
+            error statuses (via ``raise_for_status``)
+    """
+    if not isinstance(query_str, str):
+        raise ValueError(f'Expecting a str for "query_str", got "{type(query_str)}"')
+    if not isinstance(current_page, int) or current_page <= 0:
+        raise ValueError(f'Expected a positive int for "current_page", got {current_page!r}')
+    if not isinstance(works_per_page, int) or works_per_page <= 0:
+        raise ValueError(f'Expected a positive int for "works_per_page", got {works_per_page!r}')
+
+    # quote_plus keeps characters such as "&", "#" or "+" inside the search term
+    # instead of letting them terminate it or inject extra query parameters.
+    full_request = (
+        f"{OPEN_ALEX_ENDPOINT}{quote_plus(query_str)}"
+        f"&per_page={works_per_page}&page={current_page}"
+    )
+
+    # Without a timeout requests waits indefinitely on an unresponsive server.
+    response = requests.get(full_request, timeout=timeout)
+    response.raise_for_status()
+    return response.json()
+
+
+def test():
+    """Manual smoke test: fetch one page for a sample query and load it into a DataFrame (which is discarded)."""
+    query = "multiverse"
+    data = search_open_alex(query)
+    results = get_results_open_alex(data)
+    df = pd.DataFrame(results)
+
+
+# if __name__ == "__main__":
+#     test()