Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
.DS_Store
__pycache__/
biblio_parsed.txt
pubmed.txt
pubmed_data.csv
pubmed_data.csv
bibliovenv/
Bibenv/
.idea/
.idea/
.venv
data/
480 changes: 451 additions & 29 deletions app.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@
from .get_thematicevolution import *
from .get_cocitation import *
from .get_collaborationnetwork import *
from .get_worldmapcollaboration import *
from .get_worldmapcollaboration import *
from .save_api_results_to_file import *
from .load_df_from_parquet import *
57 changes: 36 additions & 21 deletions functions/get_database.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from www.services import *


Expand All @@ -11,27 +12,41 @@ def get_database(input):
Returns:
A string representing the name of the database.
"""
if input.select() == "1A": # Bibliographic databases
database = ''
try:
if input.select() == "1A": # Bibliographic databases

database = input.database()

if database == "wos":
database = "Web of Science"
elif database == "scopus":
database = "Scopus"
elif database == "dimensions":
database = "Dimensions"
elif database == "lens":
database = "Lens.org"
elif database == "pubmed":
database = "PubMed"
elif database == "cochrane":
database = "Cochrane Library"

database = input.database()
elif input.select() == "1B": # Bibliometrix database
database = "Bibliometrix"

if database == "wos":
database = "Web of Science"
elif database == "scopus":
database = "Scopus"
elif database == "dimensions":
database = "Dimensions"
elif database == "lens":
database = "Lens.org"
elif database == "pubmed":
database = "PubMed"
elif database == "cochrane":
database = "Cochrane Library"

elif input.select() == "1B": # Bibliometrix database
database = "Bibliometrix"

elif input.select() == "1C": # Sample database
database = "Sample"

elif input.select() == "1C": # Sample database
database = "Sample"

elif input.select() == "1E":
database = "Local parquet file"

elif input.api_select() in ['pubmed', 'openalex']:
database_name = input.api_select()
if database_name == 'pubmed':
database = 'PubMed'
if database_name == 'openalex':
database = 'OpenAlex'
except Exception as e:
logging.error(f'Error: \n{e}\n\n')

return database
7 changes: 6 additions & 1 deletion functions/get_localcitedsources.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from www.services import *

import numpy as np
import logging
import pandas as pd

def get_local_cited_sources(df, num_of_cited_sources):
"""
Expand Down Expand Up @@ -99,6 +101,9 @@ def wrap_label(label, width=50):

# Set x-axis ticks to 0, 50, 100, etc.
max_x = source_counts["N. of Local Citations"].max()
logging.debug(f'get_localCitedSources.py - max_x value= "{max_x}", max_x type: "{type(max_x)}"')
if pd.isna(max_x):
max_x = 10
tick_step = 50
x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
if x_ticks[-1] < max_x:
Expand Down
20 changes: 18 additions & 2 deletions functions/get_table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from www.services import *
from functions.get_status import *
import logging


# Function to create a Plotly table visualization for metadata completeness
Expand Down Expand Up @@ -78,8 +79,11 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
Returns:
A DataTable object if data is available, otherwise a message indicating no data.
"""
logging.debug('get_table - Invoked "get_table()"')
# Retrieve the data from the DataFrame
data = df.get()
logging.debug(f'data: \n{data}')


table_html = ""
fig = None
Expand Down Expand Up @@ -125,8 +129,17 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
}

# Count missing values (NaN), empty strings, and empty lists in each column
missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
data.map(lambda x: x == [])).sum()
def is_empty_string(x):
    """Return True only when ``x`` is the string ``""`` or ``" "``.

    Any value that is not a ``str`` (lists, arrays, dicts, numbers, None,
    missing-value markers, ...) is reported as not being an empty string.
    """
    # Check the type before comparing: testing membership with ``in`` falls
    # back to ``==``, and for ``pd.NA`` that comparison yields NA, whose truth
    # value is ambiguous and raises TypeError.
    return isinstance(x, str) and x in ("", " ")

is_missing = data.isna() | data.map(is_empty_string)
missing_counts = is_missing.sum()

# missing_counts = data.isna().sum() + (data == "").sum() + (data == " ").sum() + (
# data.map(lambda x: x == [])).sum()

# Calculate the percentage of missing values for each column
missing_percentage = (missing_counts / total_rows) * 100
Expand All @@ -149,6 +162,9 @@ def get_table(database, df, dpi=300, filter=False, modal=True):
# Create and return the Plotly table
fig = create_plotly_table(sorted_columns, dpi)

logging.debug('get_table - FIG OBJECT')
logging.debug(fig)

# HTML table header
table_header = """
<table style="width:100%; border-collapse: collapse;">
Expand Down
22 changes: 22 additions & 0 deletions functions/load_df_from_parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pandas as pd
import logging

def _clean_numpy_collections(df: pd.DataFrame) -> pd.DataFrame:
    """Convert numpy arrays stored in object columns to native Python lists.

    Only columns of dtype ``object`` are inspected. Cells that are not
    ``numpy.ndarray`` instances are left untouched.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, for convenient chaining.
    """
    import numpy as np

    def _is_array(value):
        return isinstance(value, np.ndarray)

    def _to_native(value):
        # ndarray.tolist() also converts the elements (and nested arrays) to
        # built-in Python types; list(value) would keep numpy scalars inside.
        return value.tolist() if _is_array(value) else value

    # Target only 'object' columns, which hold strings, lists, or arrays
    object_cols = df.select_dtypes(include=['object']).columns

    for col in object_cols:
        # Rewrite the column only if it actually contains numpy arrays
        if df[col].apply(_is_array).any():
            df[col] = df[col].apply(_to_native)

    return df

def load_df_from_parquet(filepath):
    """Load a parquet file into a DataFrame and normalise embedded arrays.

    Args:
        filepath: Location of the parquet file, passed straight to
            ``pandas.read_parquet``.

    Returns:
        The loaded DataFrame after ``_clean_numpy_collections`` has been
        applied to it.
    """
    df_local = pd.read_parquet(filepath)
    df_local = _clean_numpy_collections(df_local)
    # Lazy %-style arguments: the DataFrame is rendered to text only when
    # DEBUG logging is actually enabled.
    logging.debug('load_df_from_parquet - %s', df_local)
    return df_local
71 changes: 71 additions & 0 deletions functions/save_api_results_to_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from pathlib import Path
import pandas as pd
import logging

def _log_frame_info(df: pd.DataFrame) -> None:
    """Log the ``df.info(verbose=True)`` summary at DEBUG level."""
    # Building the summary is not free, so skip it unless DEBUG is enabled.
    if not logging.getLogger().isEnabledFor(logging.DEBUG):
        return

    import io

    # DataFrame.info() writes to a buffer (stdout by default) and returns
    # None, so the text must be captured explicitly to reach the log.
    buffer = io.StringIO()
    df.info(verbose=True, buf=buffer)
    logging.debug("%s", buffer.getvalue())


def save_api_results_to_file(folder_path: str, file_name: str, df_to_save: pd.DataFrame) -> bool:
    """Merge API results into a parquet file kept in local storage.

    The rows of ``df_to_save`` are appended to the rows already stored in
    ``folder_path / file_name`` (the file is created when missing). Rows are
    de-duplicated on the "UT" column, keeping the most recent occurrence, and
    the file is rewritten only when the merge yields more rows than were
    already stored.

    Args:
        folder_path: Directory that holds the file; created if necessary.
        file_name: Name of the parquet file inside ``folder_path``.
        df_to_save: Rows to store. Must contain a "UT" column and have the
            same set of columns as the stored file.

    Returns:
        bool: True if ok (including "nothing new to add"), False if error.
    """
    if not isinstance(folder_path, str):
        logging.error(f'Arg "folder_path" is of type "{type(folder_path)}", expected a str')
        return False
    if not isinstance(file_name, str):
        logging.error(f'Arg "file_name" is of type "{type(file_name)}", expected a str')
        return False
    if not isinstance(df_to_save, pd.DataFrame):
        logging.error(f'Arg "df_to_save" is of type "{type(df_to_save)}", expected a pd.DataFrame')
        return False
    # "UT" is the de-duplication key below; fail cleanly instead of letting
    # drop_duplicates raise KeyError.
    if "UT" not in df_to_save.columns:
        logging.error('Arg "df_to_save" has no "UT" column, cannot de-duplicate rows')
        return False

    _log_frame_info(df_to_save)

    dir_path = Path(folder_path)
    try:
        dir_path.mkdir(parents=True, exist_ok=True)
    except OSError:
        logging.exception(f"Error creating the folder: {dir_path}")
        return False

    file_path = dir_path / file_name

    if file_path.exists():
        try:
            df_local = pd.read_parquet(file_path)
        except Exception:
            logging.exception(f"Error reading the file: {file_path}")
            return False
        _log_frame_info(df_local)
    else:
        logging.warning(f"File not found. Initializing a new dataset for {file_path}...")
        # Empty frame with the incoming structure, so the checks below apply
        # uniformly to new and existing files.
        df_local = df_to_save.iloc[0:0].copy()

    if set(df_local.columns) != set(df_to_save.columns):
        logging.error(
            "Structure mismatch! The file structure does not match the DataFrame in memory."
        )
        return False

    logging.info("Structures match. Checking for new rows...")

    if df_local.empty:
        # Nothing stored yet: skip concatenating an empty frame.
        combined = df_to_save.reset_index(drop=True)
    else:
        combined = pd.concat([df_local, df_to_save], ignore_index=True)

    # keep="last" lets freshly fetched rows win over the stored copy.
    merged = combined.drop_duplicates(subset=["UT"], keep="last")
    new_rows_count = len(merged) - len(df_local)

    _log_frame_info(merged)
    logging.debug("%s", merged)
    logging.debug("%s", new_rows_count)

    if new_rows_count <= 0:
        logging.info("No new rows to add. File is up to date.")
        return True

    try:
        merged.to_parquet(file_path)
    except Exception:
        logging.exception('Something went wrong. Did not update file')
        return False

    logging.info(f"Successfully added {new_rows_count} new rows to the file.")
    return True
Binary file modified requirements.txt
Binary file not shown.
18 changes: 12 additions & 6 deletions www/services/format_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,11 @@ def format_di_column(entry, source, file_type): # Function for DI Column
doi = entry.get('DI', [''])[0]
elif source == 'PubMed':
if file_type == '.txt':
doi = entry.get('LID', '')
doi_pattern = r'(.+?)\s*\[doi\]$'
doi_raw = entry.get('LID', '')
for doi_value in doi_raw.split(";"):
if re.match(doi_pattern, doi_value):
return doi_value.replace('[doi]', '').strip()
elif source == 'Scopus':
if file_type == '.bib':
doi = entry.get('doi', '')
Expand Down Expand Up @@ -993,6 +997,8 @@ def format_py_column(entry, source, file_type): # Function for PY Column
if file_type == '.txt':
publication_year = entry.get('DP', '')
publication_year = re.findall(r'\d{4}', publication_year)[0] if publication_year else ''
if publication_year != '':
publication_year = int(publication_year) if publication_year.isdigit() else 0
elif source == 'Scopus':
if file_type == '.bib':
publication_year = str(entry.get('year', ''))
Expand Down Expand Up @@ -1627,11 +1633,11 @@ def process_single_file(data, source, file_type, author):
if column not in entry_data: # Avoid overwriting existing keys
entry_data[column] = entry.get(column, None)

# Remove the column based on the value of the 'author' field
if author == "surname":
entry_data.pop('AF', None) # Remove 'AF' if it exists
elif author == "fullname":
entry_data.pop('AU', None) # Remove 'AU' if it exists
# # Remove the column based on the value of the 'author' field
# if author == "surname":
# entry_data.pop('AF', None) # Remove 'AF' if it exists
# elif author == "fullname":
# entry_data.pop('AU', None) # Remove 'AU' if it exists

entries.append(entry_data)

Expand Down
6 changes: 6 additions & 0 deletions www/services/openalex/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import nltk
import logging

# Configure the root logger at DEBUG level as soon as this package is imported.
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers, so the effect depends on import order — confirm this is intended.
logging.basicConfig(level=logging.DEBUG)
# Raise the threshold of the logger named 'parser' to WARNING.
# NOTE(review): presumably aimed at the sibling `parser` module; a logger
# created there via getLogger(__name__) would carry the dotted package path,
# not the bare name 'parser' — verify against that module.
logging.getLogger('parser').setLevel(logging.WARNING)
# Fetch the NLTK 'wordnet' resource at import time.
# NOTE(review): this may reach the network whenever the package is imported —
# confirm that is acceptable for deployments.
nltk.download('wordnet')
46 changes: 46 additions & 0 deletions www/services/openalex/api_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import requests
from .utils import *
import pandas as pd
from .parser import transform_from_open_alex

# NOTE(review): this API key is committed in plain text. Consider loading it
# from configuration or the environment and rotating the exposed value.
OPEN_ALEX_KEY = "wi0R0MWb5Dy1mtZv0OMMn5"
# Base URL for work searches: the key is embedded as a query parameter and the
# URL ends with an open "search=" that callers complete with their query text.
OPEN_ALEX_ENDPOINT = f"https://api.openalex.org/works?api_key={OPEN_ALEX_KEY}&search="
# Key-less variant of the endpoint, kept for reference.
#OPEN_ALEX_ENDPOINT = f"https://api.openalex.org/works?search="

def search_open_alex(query_str: str, current_page: int = 1, works_per_page: int = 20,
                     timeout: float = 30.0) -> dict:
    """Query the OpenAlex works endpoint and return the decoded JSON response.

    Args:
        query_str (str): Raw search text. It is URL-encoded here, so pass it
            unencoded.
        current_page (int): 1-based page number to request.
        works_per_page (int): Number of works per page.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: The JSON body exactly as decoded by ``response.json()``.
            NOTE(review): presumably the OpenAlex envelope whose result list
            is extracted later with ``get_results_open_alex`` — confirm.

    Raises:
        ValueError: If an argument has the wrong type or is not positive.
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    from urllib.parse import quote_plus

    if not isinstance(query_str, str):
        raise ValueError(f'Expecting a str for "query_str", got "{type(query_str)}"')
    if not isinstance(current_page, int) or current_page <= 0:
        raise ValueError(f'Expected a positive int for "current_page", got {current_page!r}')
    if not isinstance(works_per_page, int) or works_per_page <= 0:
        raise ValueError(f'Expected a positive int for "works_per_page", got {works_per_page!r}')

    # Encode the query so characters such as "&" or "#" stay inside the search
    # term instead of being parsed as URL syntax.
    full_request = (
        f"{OPEN_ALEX_ENDPOINT}{quote_plus(query_str)}"
        f"&per_page={works_per_page}&page={current_page}"
    )

    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.get(full_request, timeout=timeout)
    response.raise_for_status()
    return response.json()

def test():
    """Manual smoke check: search OpenAlex for a fixed term and tabulate it.

    Runs a live search for "multiverse", extracts the results with
    ``get_results_open_alex`` and builds a DataFrame from them. Nothing is
    returned; the call is only useful for spotting exceptions.
    """
    payload = search_open_alex("multiverse")
    rows = get_results_open_alex(payload)
    pd.DataFrame(rows)


# if __name__ == "__main__":
# test()
Loading