Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
__pycache__/
bibliovenv/
Bibenv/
.idea/
.idea/.DS_Store
*.bak
all_code.txt
venv/
__pycache__/
*.pyc
*.rda
*.rdata
Empty file added analysis.py
Empty file.
203 changes: 102 additions & 101 deletions app.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
extraction:
query: "bibliometrics"
max_results: 50
source: "OPENALEX"

paths:
output_csv: "standardized_output.csv"
log_file: "pipeline.log"
5 changes: 3 additions & 2 deletions functions/get_affiliationproductionovertime.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ def get_affiliation_production_over_time(df, top_k_affiliations):
Returns:
A Plotly figure object representing the affiliation's production over time.
"""
data = df.get()
data = df

AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""])
AFF_series = data["AU_UN"].fillna("").apply(lambda x: [aff for aff in (x if isinstance(x, list) else str(x).split(";")) if str(aff).strip() not in ["", "nan"]])
AFF = AFF_series
nAFF = [len(aff) for aff in AFF]

affiliations = [aff for sublist in AFF for aff in sublist]
Expand Down
2 changes: 1 addition & 1 deletion functions/get_annualproduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def get_annual_production(df):
Returns:
A Plotly figure object representing the annual scientific production.
"""
data = df.get()
data = df

# Calculate the number of publications per year
publications_per_year = data["PY"].value_counts().sort_index().reset_index()
Expand Down
2 changes: 1 addition & 1 deletion functions/get_authorlocalimpact.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impac
Returns:
A Plotly figure object and a DataFrame of the most impactful sources.
"""
df = df.get()
df = df
today = pd.Timestamp.now().year

# Ensure 'TC' and 'PY' are numeric
Expand Down
2 changes: 1 addition & 1 deletion functions/get_authorproductionovertime.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def get_author_production_over_time(df, top_k_authors):
table_authors_production (pd.DataFrame): Table summarizing authors' production with TC and TCpY.
table_documents (pd.DataFrame): Detailed table with additional document information.
"""
data = df.get()
data = df

# Ensure "PY" is numeric
data["PY"] = pd.to_numeric(data["PY"], errors="coerce")
Expand Down
2 changes: 1 addition & 1 deletion functions/get_averagecitations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def get_average_citations(df):
Returns:
A Plotly figure object representing the average citations per year.
"""
data = df.get()
data = df

# Calculate the current year
current_year = pd.Timestamp.now().year + 1
Expand Down
6 changes: 3 additions & 3 deletions functions/get_bradfordlaw.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def get_bradford_law(df):
A Plotly figure object and a DataFrame of the Bradford's Law zones.
"""
# Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE))
data = df.get()
data = df
source_counts = data["SO"].value_counts()

# Total number of sources
Expand Down Expand Up @@ -67,7 +67,7 @@ def get_bradford_law(df):
fig.add_shape(
type="rect",
x0=0,
x1=np.log(df_bradford["Rank"][a]),
x1=np.log(df_bradford["Rank"].iloc[int(a)-1]),
y0=0,
y1=df_bradford["Freq"].max(),
fillcolor="#B3D1F2",
Expand All @@ -78,7 +78,7 @@ def get_bradford_law(df):

# Add the "Core Sources" annotation with smaller font
fig.add_annotation(
x=np.log(df_bradford["Rank"][a]) / 2,
x=np.log(df_bradford["Rank"].iloc[int(a)-1]) / 2,
y=df_bradford["Freq"].max() * 0.85,
text="<b>Core<br>Sources</b>",
showarrow=False,
Expand Down
126 changes: 126 additions & 0 deletions functions/get_bradfordlaw.py.bak
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from www.services import *


def get_bradford_law(df):
    """
    Generate a plot and table based on Bradford's Law.

    Sources (the distinct values of the "SO" column) are ranked by how many
    rows reference them, and split into three zones at 33% and 67% of the
    cumulative frequency. Zone 1 holds the "core" sources.

    Args:
        df: A DataFrame with an "SO" column; rows are counted per distinct
            "SO" value.

    Returns:
        A tuple ``(fig, df_bradford)``: a Plotly figure plotting frequency
        against log(rank), and a DataFrame with the columns "SO" (name
        truncated to 25 characters), "Rank", "Freq", "cumFreq" and "Zone".
        When "SO" holds no countable values, an empty figure and an empty
        DataFrame with the same columns are returned.
    """
    # value_counts() sorts by descending frequency
    # (equivalent to R's sort(table(M$SO), decreasing = TRUE)).
    data = df
    source_counts = data["SO"].value_counts()

    # Nothing to rank: building the zone labels below would otherwise produce
    # one "Zone 1" label for zero rows and fail in the DataFrame constructor.
    if source_counts.empty:
        return go.Figure(), pd.DataFrame(
            columns=["SO", "Rank", "Freq", "cumFreq", "Zone"]
        )

    # Total number of counted rows
    n = source_counts.sum()
    # Cumulative sum of the frequencies (equivalent to cumsum in R)
    cumSO = source_counts.cumsum()

    # Number of sources in the core zone (a) and in the first two zones (b).
    # The last cumulative value equals n, so both counts are at most
    # len(cumSO) and the three zone runs below always add up to len(cumSO).
    # NOTE: the previous unused `pd.cut(cumSO, bins=[1, n*0.33, n*0.67, inf])`
    # was dropped; it raised ValueError for tiny inputs where n * 0.33 < 1.
    a = int((cumSO < n * 0.33).sum()) + 1
    b = int((cumSO < n * 0.67).sum()) + 1
    Z = ["Zone 1"] * a + ["Zone 2"] * (b - a) + ["Zone 3"] * (len(cumSO) - b)

    # Create a DataFrame for Bradford's Law table
    df_bradford = pd.DataFrame({
        "SO": cumSO.index.str[:25],  # Shorten the source names to 25 characters
        "Rank": range(1, len(cumSO) + 1),
        "Freq": source_counts.values,
        "cumFreq": cumSO.values,
        "Zone": Z
    })

    # log(rank) of the last core source. Positional access is required here:
    # the frame has a 0-based index, so the label lookup `["Rank"][a]` pointed
    # one row too far and raised KeyError when every source was in the core.
    core_log_rank = np.log(df_bradford["Rank"].iloc[a - 1])
    max_freq = df_bradford["Freq"].max()

    # Create the Plotly figure
    fig = go.Figure()

    # Add the line plot without text above the points
    fig.add_trace(go.Scatter(
        x=np.log(df_bradford["Rank"]),
        y=df_bradford["Freq"],
        mode='lines+markers',
        name='Articles per Source',
        marker=dict(
            color='#5567BB',
            size=10,
            line=dict(width=1, color='white'),
            opacity=0.95
        ),
        line=dict(color='#5567BB', width=2, shape='spline'),
        hovertemplate=(
            "<b>Source:</b> %{customdata[0]}<br>"
            "<b>Rank:</b> %{x:.2f}<br>"
            "<b>N. of Documents:</b> %{y}<br>"
            "<b>Zone:</b> %{customdata[1]}<extra></extra>"
        ),
        customdata=np.stack([df_bradford["SO"], df_bradford["Zone"]], axis=-1)
    ))

    # Add the "Core Sources" area with the rectangle
    fig.add_shape(
        type="rect",
        x0=0,
        x1=core_log_rank,
        y0=0,
        y1=max_freq,
        fillcolor="#B3D1F2",
        opacity=0.18,
        line_width=0,
        layer="below"
    )

    # Add the "Core Sources" annotation with smaller font
    fig.add_annotation(
        x=core_log_rank / 2,
        y=max_freq * 0.85,
        text="<b>Core<br>Sources</b>",
        showarrow=False,
        font=dict(size=15, color="#5567BB", family="Segoe UI, Arial"),
        align="center",
        bgcolor="rgba(255,255,255,0.7)",
        bordercolor="#B3D1F2",
        borderpad=4,
        borderwidth=1,
    )

    # Customize the X axis labels (log scale) with smaller font;
    # only the core sources get a named tick.
    fig.update_layout(
        xaxis=dict(
            title="Source log(Rank)",
            tickmode='array',
            tickvals=np.log(df_bradford["Rank"][:a]),
            ticktext=df_bradford["SO"][:a],
            tickangle=90,
            showgrid=True,
            gridcolor="#F0F0F0",
            zeroline=False,
            tickfont=dict(size=10),
        ),
        yaxis=dict(
            title="N. of Documents",
            showgrid=True,
            gridcolor="#F0F0F0",
            zeroline=False,
            tickfont=dict(size=10),
        ),
        plot_bgcolor='white',
        font=dict(color="#222222", size=11, family="Segoe UI, Arial"),
        margin=dict(l=80, r=40, t=40, b=120),
        height=800,
        showlegend=False,
        hoverlabel=dict(
            bgcolor="white",
            font_size=11,
            font_family="Segoe UI, Arial",
            bordercolor="#5567BB"
        ),
    )

    return fig, df_bradford
3 changes: 2 additions & 1 deletion functions/get_citedcountries.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure):
"""
# Extract metadata tags for cited countries
df = metaTagExtraction(df, "AU1_CO")
df = df.get()
df = df

# Prepare the table for ranking countries
tab = (
Expand Down Expand Up @@ -100,6 +100,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure):

# Set x-axis ticks
max_x = x_values.max()
max_x = 0 if pd.isna(max_x) else max_x
tick_step = 5 if max_x <= 50 else int(max_x // 10) or 1
x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
if x_ticks[-1] < max_x:
Expand Down
4 changes: 2 additions & 2 deletions functions/get_citeddocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure):
A Plotly figure object and a DataFrame of the most cited documents.
"""
# Extract metadata tags for cited documents
df = metaTagExtraction(df, "SR")
df = df.get()
if "SR" not in df.columns or (df["SR"] == "").all():
df = metaTagExtraction(df, "SR")

# Prepare the table for ranking documents
current_year = pd.to_datetime("today").year
Expand Down
4 changes: 2 additions & 2 deletions functions/get_co_occurence_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_alg

# Generate layout
# Using default igraph layout
layout = cocnet['graph']['layout']
layout = cocnet['layout']
print("Layout:", layout)
# Get coordinates from layout
coords = np.array([[pos[0], pos[1]] for pos in layout])
Expand Down Expand Up @@ -479,7 +479,7 @@ def field_by_year(df, field_cn, timespan=None, min_freq=2, n_items=5, remove_ter
The field to analyze ('ID', 'DE', 'TI', 'AB', 'WC')
"""
# Get the field data
M = df.get()
M = df

# Create co-occurrence matrix
A = cocMatrix(df, field_cn, binary=False, remove_terms=remove_terms, synonyms=synonyms)
Expand Down
2 changes: 1 addition & 1 deletion functions/get_cocitation.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def get_co_citation(
b = np.random.randint(0, 255)
cluster_colors[cluster_id] = f"rgba({r},{g},{b},0.7)"

layout = cocitnet['graph']['layout']
layout = cocitnet['layout']
coords = np.array([[pos[0], pos[1]] for pos in layout])
coords = coords / np.abs(coords).max()
coords[:, 0] *= 1000
Expand Down
4 changes: 2 additions & 2 deletions functions/get_collaborationnetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def get_collaboration_network(
print("Generating collaboration network...")

M = df
m = df.get()
m = df
NetRefs = None
Title = ""

Expand Down Expand Up @@ -108,7 +108,7 @@ def get_collaboration_network(
b = np.random.randint(0, 255)
cluster_colors[cluster_id] = f"rgba({r},{g},{b},{opacity})"

layout = netplot['graph']['layout']
layout = netplot['layout']
coords = np.array([[pos[0], pos[1]] for pos in layout])
coords = coords / np.abs(coords).max()
coords[:, 0] *= 1000
Expand Down
8 changes: 5 additions & 3 deletions functions/get_correspondingauthorcountries.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ def get_corresponding_author_countries(df, top_k_countries):
A Plotly figure object and a DataFrame of the most common corresponding author countries.
"""
# Estrai i metadati "AU_CO" e "AU1_CO" e verifica il tipo di dati
df = metaTagExtraction(df, Field="AU_CO") # Assumendo che `metaTagExtraction` sia già definita
df = metaTagExtraction(df, Field="AU1_CO")
data = df.get() # Se `df` è un oggetto reattivo
if "AU_CO" not in df.columns or df["AU_CO"].apply(lambda x: isinstance(x, list) and len(x) == 0).all():
df = metaTagExtraction(df, Field="AU_CO")
if "AU1_CO" not in df.columns or (df["AU1_CO"] == "").all():
df = metaTagExtraction(df, Field="AU1_CO")
data = df # Se `df` è un oggetto reattivo

# Assicurati che le colonne siano di tipo stringa e rimuovi righe con valori mancanti
data = data.dropna(subset=["AU1_CO", "AU_CO"])
Expand Down
2 changes: 1 addition & 1 deletion functions/get_countriesproduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_countries_production(df):
"""
# Assicurati che i metadati siano stati estratti
df = metaTagExtraction(df, "AU_CO")
df = df.get()
df = df

# Conta le occorrenze dei paesi
df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x])
Expand Down
2 changes: 1 addition & 1 deletion functions/get_countriesproductionovertime.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def get_countries_production_over_time(df, top_k_countries):
A Plotly figure object representing the country's production over time.
"""
df = metaTagExtraction(df, "AU_CO")
data = df.get()
data = df

AFF = pd.Series(data["AU_CO"]).dropna().apply(lambda x: [aff.strip() for aff in x if aff.strip() != ""])
nAFF = [len(aff) for aff in AFF]
Expand Down
11 changes: 6 additions & 5 deletions functions/get_factorialanalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def get_factorial_analysis(
# Set ngrams based on word_type
ngrams = int(ngram) if field in ['TI', 'AB'] else 1

M = df.get()
M = df
tab = table_tag(M, field, ngrams)

if len(tab) >= 2:
Expand Down Expand Up @@ -136,8 +136,8 @@ def get_factorial_analysis(

# Verifica che eigCorr esista prima di accedere
if CS["res"] is not None and hasattr(CS["res"], "eigCorr"):
xlabel = f"Dim 1 ({CS['res'].eigCorr['perc'][dimX]:.2f}%)"
ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'][dimY]:.2f}%)"
xlabel = f"Dim 1 ({CS["res"].eigCorr["perc"].iloc[dimX-1] if len(CS["res"].eigCorr["perc"]) > dimX-1 else 0:.2f}%)"
ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'].iloc[dimY-1] if len(CS['res'].eigCorr['perc']) > dimY-1 else 0:.2f}%)"
else:
xlabel, ylabel = "Dim 1", "Dim 2"

Expand All @@ -157,7 +157,8 @@ def get_factorial_analysis(
wordCoord["dotSize"] = wordCoord["dotSize"].replace([np.inf, -np.inf], np.nan)
wordCoord["dotSize"] = wordCoord["dotSize"].fillna(1)
wordCoord["dotSize"] = wordCoord["dotSize"].clip(lower=1)
thres = sorted(wordCoord["dotSize"], reverse=True)[min(int(topWordPlot), len(wordCoord) - 1)]
topWordPlot_safe = min(int(topWordPlot) if np.isfinite(topWordPlot) else len(wordCoord), len(wordCoord) - 1)
thres = sorted(wordCoord["dotSize"], reverse=True)[topWordPlot_safe]
wordCoord["labelToPlot"] = np.where(wordCoord["dotSize"] >= thres, wordCoord["label"], "")

# Avoid label overlapping
Expand Down Expand Up @@ -950,7 +951,7 @@ def factorial(X, method, n_clusters=5, k_max=5):
# Crea la lista `coord`
coord_df = pd.DataFrame({
"Dim1": cpc[:, 0],
"Dim2": cpc[:, 1],
"Dim2": cpc[:, 1] if cpc.shape[1] > 1 else np.zeros(len(cpc)),
"label": levelnames
})
mask = coord_df["label"].str[-2:] == "_1"
Expand Down
Loading