PRAISELab-PicusLab · solmaznazari · Jun 17, 2026
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,11 @@
 __pycache__/
 bibliovenv/
 Bibenv/
 .idea/
+.DS_Store
+*.bak
+all_code.txt
+venv/
+*.pyc
+*.rda
+*.rdata
diff --git a/analysis.py b/analysis.py
diff --git a/app.py b/app.py
diff --git a/config.yaml b/config.yaml
@@ -0,0 +1,8 @@
+extraction:
+  query: "bibliometrics"
+  max_results: 50
+  source: "OPENALEX"
+
+paths:
+  output_csv: "standardized_output.csv"
+  log_file: "pipeline.log"
diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py
@@ -12,9 +12,10 @@ def get_affiliation_production_over_time(df, top_k_affiliations):
     Returns:
         A Plotly figure object representing the affiliation's production over time.
     """
-    data = df.get()
+    data = df
 
-    AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""])
+    AFF_series = data["AU_UN"].fillna("").apply(lambda x: [aff for aff in (x if isinstance(x, list) else str(x).split(";")) if str(aff).strip() not in ["", "nan"]])
+    AFF = AFF_series
     nAFF = [len(aff) for aff in AFF]
 
     affiliations = [aff for sublist in AFF for aff in sublist]

diff --git a/functions/get_annualproduction.py b/functions/get_annualproduction.py
@@ -11,7 +11,7 @@ def get_annual_production(df):
     Returns:
         A Plotly figure object representing the annual scientific production.
     """
-    data = df.get()
+    data = df
 
     # Calculate the number of publications per year
     publications_per_year = data["PY"].value_counts().sort_index().reset_index()

diff --git a/functions/get_authorlocalimpact.py b/functions/get_authorlocalimpact.py
@@ -13,7 +13,7 @@ def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impac
     Returns:
         A Plotly figure object and a DataFrame of the most impactful sources.
     """
-    df = df.get()
+    df = df
     today = pd.Timestamp.now().year
 
     # Ensure 'TC' and 'PY' are numeric

diff --git a/functions/get_authorproductionovertime.py b/functions/get_authorproductionovertime.py
@@ -16,7 +16,7 @@ def get_author_production_over_time(df, top_k_authors):
         table_authors_production (pd.DataFrame): Table summarizing authors' production with TC and TCpY.
         table_documents (pd.DataFrame): Detailed table with additional document information.
     """
-    data = df.get()
+    data = df
 
     # Ensure "PY" is numeric
     data["PY"] = pd.to_numeric(data["PY"], errors="coerce")

diff --git a/functions/get_averagecitations.py b/functions/get_averagecitations.py
@@ -11,7 +11,7 @@ def get_average_citations(df):
     Returns:
         A Plotly figure object representing the average citations per year.
     """
-    data = df.get()
+    data = df
 
     # Calculate the current year
     current_year = pd.Timestamp.now().year + 1

diff --git a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py
@@ -12,7 +12,7 @@ def get_bradford_law(df):
         A Plotly figure object and a DataFrame of the Bradford's Law zones.
     """
     # Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE))
-    data = df.get()
+    data = df
     source_counts = data["SO"].value_counts()
 
     # Total number of sources
@@ -67,7 +67,7 @@ def get_bradford_law(df):
     fig.add_shape(
         type="rect",
         x0=0,
-        x1=np.log(df_bradford["Rank"][a]),
+        x1=np.log(df_bradford["Rank"].iloc[int(a)-1]),
         y0=0,
         y1=df_bradford["Freq"].max(),
         fillcolor="#B3D1F2",
@@ -78,7 +78,7 @@ def get_bradford_law(df):
 
     # Add the "Core Sources" annotation with smaller font
     fig.add_annotation(
-        x=np.log(df_bradford["Rank"][a]) / 2,
+        x=np.log(df_bradford["Rank"].iloc[int(a)-1]) / 2,
         y=df_bradford["Freq"].max() * 0.85,
         text="<b>Core<br>Sources</b>",
         showarrow=False,

diff --git a/functions/get_bradfordlaw.py.bak b/functions/get_bradfordlaw.py.bak
@@ -0,0 +1,126 @@
+from www.services import *
+
+
+def get_bradford_law(df):
+    """
+    Build the Bradford's Law plot (documents per source vs. log rank) and the zone table.
+
+    Args:
+        df: A DataFrame with an "SO" column; sources are tallied with value_counts().
+
+    Returns:
+        (fig, df_bradford): the Plotly figure and a table with SO, Rank, Freq, cumFreq, Zone.
+    """
+    # Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE))
+    data = df
+    source_counts = data["SO"].value_counts()
+
+    # Total number of counted "SO" entries (sum of the per-source frequencies)
+    n = source_counts.sum()
+    # Cumulative sum of the frequencies (equivalent to cumsum in R)
+    cumSO = source_counts.cumsum()
+
+    # Define the cut points for Bradford's Law (zones)
+    cutpoints = [1, n * 0.33, n * 0.67, float('inf')]
+    groups = pd.cut(cumSO, bins=cutpoints, labels=["Zone 1", "Zone 2", "Zone 3"])  # NOTE: not used below; zones come from Z
+
+    # a / b: how many top-ranked sources fall in Zone 1 / Zones 1-2 (cumulative share reaching 33% / 67%)
+    a = (cumSO < n * 0.33).sum() + 1
+    b = (cumSO < n * 0.67).sum() + 1
+    Z = ["Zone 1"] * a + ["Zone 2"] * (b - a) + ["Zone 3"] * (len(cumSO) - b)
+
+    # Create a DataFrame for Bradford's Law table
+    df_bradford = pd.DataFrame({
+        "SO": cumSO.index.str[:25],  # Shorten the source names to 25 characters if necessary
+        "Rank": range(1, len(cumSO) + 1),
+        "Freq": source_counts.values,
+        "cumFreq": cumSO.values,
+        "Zone": Z
+    })
+
+    # Create the Plotly figure
+    fig = go.Figure()
+
+    # Add the line plot without text above the points
+    fig.add_trace(go.Scatter(
+        x=np.log(df_bradford["Rank"]),
+        y=df_bradford["Freq"],
+        mode='lines+markers',
+        name='Articles per Source',
+        marker=dict(
+            color='#5567BB',
+            size=10,
+            line=dict(width=1, color='white'),
+            opacity=0.95
+        ),
+        line=dict(color='#5567BB', width=2, shape='spline'),
+        hovertemplate=(
+            "<b>Source:</b> %{customdata[0]}<br>"
+            "<b>Rank:</b> %{x:.2f}<br>"
+            "<b>N. of Documents:</b> %{y}<br>"
+            "<b>Zone:</b> %{customdata[1]}<extra></extra>"
+        ),
+        customdata=np.stack([df_bradford["SO"], df_bradford["Zone"]], axis=-1)
+    ))
+
+    # Shade the "Core Sources" area; rows are 0-indexed, so the a-th ranked source sits at position a - 1
+    fig.add_shape(
+        type="rect",
+        x0=0,
+        x1=np.log(df_bradford["Rank"].iloc[int(a) - 1]),
+        y0=0,
+        y1=df_bradford["Freq"].max(),
+        fillcolor="#B3D1F2",
+        opacity=0.18,
+        line_width=0,
+        layer="below"
+    )
+
+    # Add the "Core Sources" annotation with smaller font, centred over the shaded area
+    fig.add_annotation(
+        x=np.log(df_bradford["Rank"].iloc[int(a) - 1]) / 2,
+        y=df_bradford["Freq"].max() * 0.85,
+        text="<b>Core<br>Sources</b>",
+        showarrow=False,
+        font=dict(size=15, color="#5567BB", family="Segoe UI, Arial"),
+        align="center",
+        bgcolor="rgba(255,255,255,0.7)",
+        bordercolor="#B3D1F2",
+        borderpad=4,
+        borderwidth=1,
+    )
+
+    # Customize the X axis labels (log scale) with smaller font; only the first a sources get tick labels
+    fig.update_layout(
+        xaxis=dict(
+            title="Source log(Rank)",
+            tickmode='array',
+            tickvals=np.log(df_bradford["Rank"][:a]),
+            ticktext=df_bradford["SO"][:a],
+            tickangle=90,
+            showgrid=True,
+            gridcolor="#F0F0F0",
+            zeroline=False,
+            tickfont=dict(size=10),
+        ),
+        yaxis=dict(
+            title="N. of Documents",
+            showgrid=True,
+            gridcolor="#F0F0F0",
+            zeroline=False,
+            tickfont=dict(size=10),
+        ),
+        plot_bgcolor='white',
+        font=dict(color="#222222", size=11, family="Segoe UI, Arial"),
+        margin=dict(l=80, r=40, t=40, b=120),
+        height=800,
+        showlegend=False,
+        hoverlabel=dict(
+            bgcolor="white",
+            font_size=11,
+            font_family="Segoe UI, Arial",
+            bordercolor="#5567BB"
+        ),
+    )
+
+    return fig, df_bradford
diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py
@@ -15,7 +15,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure):
     """
     # Extract metadata tags for cited countries
     df = metaTagExtraction(df, "AU1_CO")
-    df = df.get()
+    df = df
 
     # Prepare the table for ranking countries
     tab = (
@@ -100,6 +100,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure):
 
     # Set x-axis ticks
     max_x = x_values.max()
+    max_x = 0 if pd.isna(max_x) else max_x
     tick_step = 5 if max_x <= 50 else int(max_x // 10) or 1
     x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
     if x_ticks[-1] < max_x:

diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py
@@ -14,8 +14,8 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure):
         A Plotly figure object and a DataFrame of the most cited documents.
     """
     # Extract metadata tags for cited documents
-    df = metaTagExtraction(df, "SR")
-    df = df.get()
+    if "SR" not in df.columns or (df["SR"] == "").all():
+        df = metaTagExtraction(df, "SR")
 
     # Prepare the table for ranking documents
     current_year = pd.to_datetime("today").year

diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py
@@ -136,7 +136,7 @@ def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_alg
 
     # Generate layout
     # Using default igraph layout
-    layout = cocnet['graph']['layout']
+    layout = cocnet['layout']
     print("Layout:", layout)
     # Get coordinates from layout
     coords = np.array([[pos[0], pos[1]] for pos in layout])
@@ -479,7 +479,7 @@ def field_by_year(df, field_cn, timespan=None, min_freq=2, n_items=5, remove_ter
         The field to analyze ('ID', 'DE', 'TI', 'AB', 'WC')
     """
     # Get the field data
-    M = df.get()
+    M = df
 
     # Create co-occurrence matrix
     A = cocMatrix(df, field_cn, binary=False, remove_terms=remove_terms, synonyms=synonyms)

diff --git a/functions/get_cocitation.py b/functions/get_cocitation.py
@@ -95,7 +95,7 @@ def get_co_citation(
         b = np.random.randint(0, 255)
         cluster_colors[cluster_id] = f"rgba({r},{g},{b},0.7)"
 
-    layout = cocitnet['graph']['layout']
+    layout = cocitnet['layout']
     coords = np.array([[pos[0], pos[1]] for pos in layout])
     coords = coords / np.abs(coords).max()
     coords[:, 0] *= 1000

diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py
@@ -46,7 +46,7 @@ def get_collaboration_network(
     print("Generating collaboration network...")
 
     M = df
-    m = df.get()
+    m = df
     NetRefs = None
     Title = ""
 
@@ -108,7 +108,7 @@ def get_collaboration_network(
         b = np.random.randint(0, 255)
         cluster_colors[cluster_id] = f"rgba({r},{g},{b},{opacity})"
 
-    layout = netplot['graph']['layout']
+    layout = netplot['layout']
     coords = np.array([[pos[0], pos[1]] for pos in layout])
     coords = coords / np.abs(coords).max()
     coords[:, 0] *= 1000

diff --git a/functions/get_correspondingauthorcountries.py b/functions/get_correspondingauthorcountries.py
@@ -13,9 +13,11 @@ def get_corresponding_author_countries(df, top_k_countries):
         A Plotly figure object and a DataFrame of the most common corresponding author countries.
     """
     # Estrai i metadati "AU_CO" e "AU1_CO" e verifica il tipo di dati
-    df = metaTagExtraction(df, Field="AU_CO")  # Assumendo che `metaTagExtraction` sia già definita
-    df = metaTagExtraction(df, Field="AU1_CO")
-    data = df.get()  # Se `df` è un oggetto reattivo
+    if "AU_CO" not in df.columns or df["AU_CO"].apply(lambda x: isinstance(x, list) and len(x) == 0).all():  # extract only when the column is missing or every row is an empty list
+        df = metaTagExtraction(df, Field="AU_CO")
+    if "AU1_CO" not in df.columns or (df["AU1_CO"] == "").all():  # extract only when the column is missing or every row is an empty string
+        df = metaTagExtraction(df, Field="AU1_CO")
+    data = df  # df is used as-is (no .get() unwrapping of a reactive object)
 
     # Assicurati che le colonne siano di tipo stringa e rimuovi righe con valori mancanti
     data = data.dropna(subset=["AU1_CO", "AU_CO"])

diff --git a/functions/get_countriesproduction.py b/functions/get_countriesproduction.py
@@ -13,7 +13,7 @@ def get_countries_production(df):
     """
     # Assicurati che i metadati siano stati estratti
     df = metaTagExtraction(df, "AU_CO")
-    df = df.get()
+    df = df
 
     # Conta le occorrenze dei paesi
     df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x])

diff --git a/functions/get_countriesproductionovertime.py b/functions/get_countriesproductionovertime.py
@@ -13,7 +13,7 @@ def get_countries_production_over_time(df, top_k_countries):
         A Plotly figure object representing the country's production over time.
     """
     df = metaTagExtraction(df, "AU_CO")
-    data = df.get()
+    data = df
 
     AFF = pd.Series(data["AU_CO"]).dropna().apply(lambda x: [aff.strip() for aff in x if aff.strip() != ""])
     nAFF = [len(aff) for aff in AFF]

diff --git a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py
@@ -74,7 +74,7 @@ def get_factorial_analysis(
     # Set ngrams based on word_type
     ngrams = int(ngram) if field in ['TI', 'AB'] else 1
 
-    M = df.get()
+    M = df
     tab = table_tag(M, field, ngrams)
 
     if len(tab) >= 2:
@@ -136,8 +136,8 @@ def get_factorial_analysis(
 
             # Verifica che eigCorr esista prima di accedere
             if CS["res"] is not None and hasattr(CS["res"], "eigCorr"):
-                xlabel = f"Dim 1 ({CS['res'].eigCorr['perc'][dimX]:.2f}%)"
-                ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'][dimY]:.2f}%)"
+                xlabel = f"Dim 1 ({CS['res'].eigCorr['perc'].iloc[dimX-1] if len(CS['res'].eigCorr['perc']) > dimX-1 else 0:.2f}%)"
+                ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'].iloc[dimY-1] if len(CS['res'].eigCorr['perc']) > dimY-1 else 0:.2f}%)"
             else:
                 xlabel, ylabel = "Dim 1", "Dim 2"
 
@@ -157,7 +157,8 @@ def get_factorial_analysis(
         wordCoord["dotSize"] = wordCoord["dotSize"].replace([np.inf, -np.inf], np.nan)
         wordCoord["dotSize"] = wordCoord["dotSize"].fillna(1)
         wordCoord["dotSize"] = wordCoord["dotSize"].clip(lower=1)
-        thres = sorted(wordCoord["dotSize"], reverse=True)[min(int(topWordPlot), len(wordCoord) - 1)]
+        topWordPlot_safe = min(int(topWordPlot) if np.isfinite(topWordPlot) else len(wordCoord), len(wordCoord) - 1)
+        thres = sorted(wordCoord["dotSize"], reverse=True)[topWordPlot_safe]
         wordCoord["labelToPlot"] = np.where(wordCoord["dotSize"] >= thres, wordCoord["label"], "")
 
         # Avoid label overlapping
@@ -950,7 +951,7 @@ def factorial(X, method, n_clusters=5, k_max=5):
         # Crea la lista `coord`
         coord_df = pd.DataFrame({
             "Dim1": cpc[:, 0],
-            "Dim2": cpc[:, 1],
+            "Dim2": cpc[:, 1] if cpc.shape[1] > 1 else np.zeros(len(cpc)),
             "label": levelnames
         })
         mask = coord_df["label"].str[-2:] == "_1"