diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 000000000..ac04fb02c
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index 23b99e089..eda5fea88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,11 @@
__pycache__/
bibliovenv/
Bibenv/
-.idea/
\ No newline at end of file
+.idea/
+.DS_Store
+*.bak
+all_code.txt
+venv/
+*.pyc
+*.rda
+*.rdata
diff --git a/analysis.py b/analysis.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/app.py b/app.py
index f0891f894..ef94f8a63 100644
--- a/app.py
+++ b/app.py
@@ -667,7 +667,8 @@ def select_db():
"dimensions": "Dimensions",
"lens": "Lens.org",
"pubmed": "PubMed",
- "cochrane": "Cochrane Library"
+ "cochrane": "Cochrane Library",
+ "openalex": "OpenAlex"
},
)
ui.input_select(
@@ -868,7 +869,7 @@ def indicator_types_ui_all():
@reactive.calc
def filters():
- return get_filters(df)
+ return get_filters(df.get())
with ui.layout_sidebar(fillable=False, fill=False):
# Sidebar for data import options
@@ -886,9 +887,9 @@ def show_filter():
"Select Year Range",
sep="",
ticks=True,
- min=data["Min_Year"][0],
- max=data["Max_Year"][0],
- value=(data["Min_Year"][0], data["Max_Year"][0]),
+ min=data["Min_Year"].iloc[0],
+ max=data["Max_Year"].iloc[0],
+ value=(data["Min_Year"].iloc[0], data["Max_Year"].iloc[0]),
step=1,
time_format="YYYY"
)
@@ -988,7 +989,7 @@ def show_main_information_report():
"Average citations per doc"
],
"Value": [
- f"{data['Min_Year'][0]} - {data['Max_Year'][0]}",
+ f"{data['Min_Year'].iloc[0]} - {data['Max_Year'].iloc[0]}",
data['SO'].nunique(),
len(data),
- data['CAGR'][0],
+ data['CAGR'].iloc[0],  # positional access, consistent with Min_Year/Max_Year above
@@ -1060,7 +1061,7 @@ def loading_modal():
return ui.HTML(str(modal) + js)
ui.modal_show(loading_modal())
try:
- result = get_main_informations(df)
+ result = get_main_informations(df.get())
return result
finally:
ui.modal_remove()
@@ -1077,7 +1078,7 @@ def show_informations():
with ui.value_box(showcase=ICONS["timespan"], theme="bg-gradient-blue-purple"):
"Timespan"
ui.h2(
- f"{data['Min_Year'][0]} - {data['Max_Year'][0]}"
+ f"{data['Min_Year'].iloc[0]} - {data['Max_Year'].iloc[0]}"
)
with ui.value_box(showcase=ICONS["sources"], theme="bg-gradient-blue-purple"):
"Sources"
@@ -1160,7 +1161,7 @@ def table_informations():
"Average citations per doc"
],
"Value": [
- f"{data['Min_Year'][0]} - {data['Max_Year'][0]}",
+ f"{data['Min_Year'].iloc[0]} - {data['Max_Year'].iloc[0]}",
data['SO'].nunique(),
len(data),
- data['CAGR'][0],
+ data['CAGR'].iloc[0],  # positional access, consistent with Min_Year/Max_Year above
@@ -1174,7 +1175,7 @@ def table_informations():
- data['Average_Citations_per_Doc'][0]
+ data['Average_Citations_per_Doc'].iloc[0]  # positional access, same as the other summary fields
]
})
- return ui.HTML(DT(df_box, style="width=100%;"))
+ return ui.HTML(DT(df_box, style="width:100%;"))
# --- Annual Scientific Production Section ---
with ui.nav_panel("None", value="annual_scientific_production"):
@@ -1215,7 +1216,7 @@ def show_annual_production_report():
with ui.card(full_screen=True):
@reactive.calc
def annual_informations():
- return get_annual_production(df)
+ return get_annual_production(df.get())
with ui.navset_underline(id="annual_tab"):
with ui.nav_panel("Plot"):
@@ -1228,7 +1229,7 @@ def show_annual_production():
@render.ui
def table_annual_production():
_, publications_per_year = annual_informations()
- return ui.HTML(DT(publications_per_year, style="width=100%;"))
+ return ui.HTML(DT(publications_per_year, style="width:100%;"))
# AI bot Gemini Chat Integration
# --- Floating Chat Button ---
@@ -1369,7 +1370,7 @@ def show_average_citations_report():
with ui.card(full_screen=True):
@reactive.calc
def average_citations():
- return get_average_citations(df)
+ return get_average_citations(df.get())
with ui.navset_underline(id="average_tab"):
with ui.nav_panel("Plot"):
@@ -1382,7 +1383,7 @@ def show_average_citations():
@render.ui
def table_average_citations():
_, avg_citations = average_citations()
- return ui.HTML(DT(avg_citations, style="width=100%;"))
+ return ui.HTML(DT(avg_citations, style="width:100%;"))
# --- Three-Field Plot Section ---
with ui.nav_panel("None", value="three_field_plot"):
@@ -1467,7 +1468,7 @@ def calculate_three_field_plot():
middle_field_items = input.middle_field_items()
right_field_items = input.right_field_items()
- result = get_three_field_plot(df, left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items)
+ result = get_three_field_plot(df.get(), left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items)
three_field_plot_results.set(result)
finally:
ui.modal_remove()
@@ -1601,7 +1602,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
num_of_sources = input.num_of_sources()
- result = get_relevant_sources(df, num_of_sources)
+ result = get_relevant_sources(df.get(), num_of_sources)
relevant_sources_results.set(result)
finally:
ui.modal_remove()
@@ -1636,7 +1637,7 @@ def table_relevant_sources():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, relevant_sources_tab = result
- return ui.HTML(DT(relevant_sources_tab, style="width=100%;"))
+ return ui.HTML(DT(relevant_sources_tab, style="width:100%;"))
# --- Most Local Cited Sources Section ---
with ui.nav_panel("None", value="most_local_cited_sources"):
@@ -1745,7 +1746,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
num_of_cited_sources = input.num_of_cited_sources()
- result = get_local_cited_sources(df, num_of_cited_sources)
+ result = get_local_cited_sources(df.get(), num_of_cited_sources)
local_cited_sources_results.set(result)
finally:
ui.modal_remove()
@@ -1780,7 +1781,7 @@ def table_local_cited_sources():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, local_cited_sources_tab = result
- return ui.HTML(DT(local_cited_sources_tab, style="width=100%;"))
+ return ui.HTML(DT(local_cited_sources_tab, style="width:100%;"))
# --- Bradford's Law Section ---
with ui.nav_panel("None", value="bradfords_law"):
@@ -1821,7 +1822,7 @@ def show_bradfords_law_report():
with ui.card(full_screen=True):
@reactive.calc
def bradford_law():
- return get_bradford_law(df)
+ return get_bradford_law(df.get())
with ui.navset_underline(id="bradford_law_tab"):
with ui.nav_panel("Plot"):
@@ -1834,7 +1835,7 @@ def show_bradford_law():
@render.ui
def table_bradford_law():
_, bradford_law_tab = bradford_law()
- return ui.HTML(DT(bradford_law_tab, style="width=100%;"))
+ return ui.HTML(DT(bradford_law_tab, style="width:100%;"))
# --- Sources' Local Impact Section ---
with ui.nav_panel("None", value="sources_local_impact"):
@@ -1945,7 +1946,7 @@ def loading_modal():
try:
num_of_sources_local_impact = input.num_of_sources_local_impact()
source_local_impact = input.source_local_impact()
- result = get_sources_local_impact(df, num_of_sources_local_impact, source_local_impact)
+ result = get_sources_local_impact(df.get(), num_of_sources_local_impact, source_local_impact)
sources_local_impact_results.set(result)
finally:
ui.modal_remove()
@@ -1980,7 +1981,7 @@ def table_sources_local_impact():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, sources_local_impact_tab = result
- return ui.HTML(DT(sources_local_impact_tab, style="width=100%;"))
+ return ui.HTML(DT(sources_local_impact_tab, style="width:100%;"))
# --- Sources' Production ---
with ui.nav_panel("None", value="sources_production"):
@@ -2080,7 +2081,7 @@ def loading_modal():
try:
num_of_sources_production = input.num_of_sources_production()
occurences = input.occurences()
- result = get_sources_production(df, num_of_sources_production, occurences)
+ result = get_sources_production(df.get(), num_of_sources_production, occurences)
sources_production_result.set(result)
finally:
ui.modal_remove()
@@ -2126,7 +2127,7 @@ def table_sources_production():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, sources_production_tab = result
- return ui.HTML(DT(sources_production_tab, style="width=100%;"))
+ return ui.HTML(DT(sources_production_tab, style="width:100%;"))
# --- Most Relevant Authors Section ---
with ui.nav_panel("None", value="most_relevant_authors"):
@@ -2227,7 +2228,7 @@ def loading_modal():
try:
num_of_authors = input.num_of_authors()
frequency = input.frequency()
- result = get_relevant_authors(df, num_of_authors, frequency)
+ result = get_relevant_authors(df.get(), num_of_authors, frequency)
relevant_authors_result.set(result)
finally:
ui.modal_remove()
@@ -2273,7 +2274,7 @@ def table_relevant_authors():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, relevant_authors_tab = result
- return ui.HTML(DT(relevant_authors_tab, style="width=100%;"))
+ return ui.HTML(DT(relevant_authors_tab, style="width:100%;"))
# --- Most Local Cited Authors Section ---
with ui.nav_panel("None", value="most_local_cited_authors"):
@@ -2376,7 +2377,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
num_of_cited_authors = input.num_of_cited_authors()
- result = get_local_cited_authors(df, num_of_cited_authors)
+ result = get_local_cited_authors(df.get(), num_of_cited_authors)
local_cited_authors_result.set(result)
finally:
ui.modal_remove()
@@ -2421,7 +2422,7 @@ def table_local_cited_authors():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, local_cited_authors_tab = result
- return ui.HTML(DT(local_cited_authors_tab, style="width=100%;"))
+ return ui.HTML(DT(local_cited_authors_tab, style="width:100%;"))
# --- Authors' Production over Time Section ---
with ui.nav_panel("None", value="authors_production"):
@@ -2521,7 +2522,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
top_k_authors = input.TopAuthorsProdK()
- result = get_author_production_over_time(df, top_k_authors)
+ result = get_author_production_over_time(df.get(), top_k_authors)
au_over_time_result.set(result)
finally:
ui.modal_remove()
@@ -2566,7 +2567,7 @@ def table_authors_production():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, table_authors_production, _ = result
- return ui.HTML(DT(table_authors_production, style="width=100%;"))
+ return ui.HTML(DT(table_authors_production, style="width:100%;"))
with ui.nav_panel("Table - Documents"):
@render.ui
@@ -2584,7 +2585,7 @@ def table_documents():
table_documents['DOI'] = table_documents['DOI'].apply(
lambda x: f'{x}' if x != "N/A" else x
)
- return ui.HTML(DT(table_documents, style="width=100%;"))
+ return ui.HTML(DT(table_documents, style="width:100%;"))
# AI bot Gemini Chat Integration
# --- Floating Chat Button ---
@render.express()
@@ -2723,7 +2724,7 @@ def show_lotkas_law_report():
with ui.card(full_screen=True):
@reactive.calc
def lotka_law():
- return get_lotka_law(df)
+ return get_lotka_law(df.get())
with ui.navset_underline(id="lotka_law_tab"):
with ui.nav_panel("Plot"):
@@ -2736,7 +2737,7 @@ def show_lotka_law():
@render.ui
def table_lotka_law():
_, lotka_law_tab = lotka_law()
- return ui.HTML(DT(lotka_law_tab, style="width=100%;"))
+ return ui.HTML(DT(lotka_law_tab, style="width:100%;"))
# --- Authors' Local Impact Section ---
with ui.nav_panel("None", value="authors_local_impact"):
@@ -2837,7 +2838,7 @@ def loading_modal():
try:
num_of_authors_local_impact = input.num_of_authors_local_impact()
author_local_impact = input.author_local_impact()
- result = get_authors_local_impact(df, num_of_authors_local_impact, author_local_impact)
+ result = get_authors_local_impact(df.get(), num_of_authors_local_impact, author_local_impact)
authors_local_impact_result.set(result)
finally:
ui.modal_remove()
@@ -2883,7 +2884,7 @@ def table_authors_local_impact():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, authors_local_impact_tab = result
- return ui.HTML(DT(authors_local_impact_tab, style="width=100%;"))
+ return ui.HTML(DT(authors_local_impact_tab, style="width:100%;"))
# --- Most Relevant Affiliations Section ---
with ui.nav_panel("None", value="most_relevant_affiliations"):
@@ -2984,7 +2985,7 @@ def loading_modal():
try:
num_of_affiliations = input.num_of_affiliations()
disambiguation = input.disambiguation()
- result = get_relevant_affiliations(df, num_of_affiliations, disambiguation)
+ result = get_relevant_affiliations(df.get(), num_of_affiliations, disambiguation)
relevant_affiliations_result.set(result)
finally:
ui.modal_remove()
@@ -3030,7 +3031,7 @@ def table_relevant_affiliations():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, relevant_affiliations_tab = result
- return ui.HTML(DT(relevant_affiliations_tab, style="width=100%;"))
+ return ui.HTML(DT(relevant_affiliations_tab, style="width:100%;"))
# --- Affiliations' Production over Time Section ---
with ui.nav_panel("None", value="affiliations_production"):
@@ -3137,7 +3138,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
top_k_affiliations = input.TopAffProdK()
- result = get_affiliation_production_over_time(df, top_k_affiliations)
+ result = get_affiliation_production_over_time(df.get(), top_k_affiliations)
affiliations_production_results.set(result)
finally:
ui.modal_remove()
@@ -3172,7 +3173,7 @@ def table_affiliations_production():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, table_affiliations_production = result
- return ui.HTML(DT(table_affiliations_production, style="width=100%;"))
+ return ui.HTML(DT(table_affiliations_production, style="width:100%;"))
- # --- Affiliations' Local Impact Section ---
+ # --- Corresponding Author's Countries Section ---
with ui.nav_panel("None", value="corresponding_authors"):
@@ -3281,7 +3282,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
top_k_countries = input.TopCountries()
- result = get_corresponding_author_countries(df, top_k_countries)
+ result = get_corresponding_author_countries(df.get(), top_k_countries)
corresponding_authors_results.set(result)
finally:
ui.modal_remove()
@@ -3316,7 +3317,7 @@ def table_countries_collaboration():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, countries_table = result
- return ui.HTML(DT(countries_table, style="width=100%;"))
+ return ui.HTML(DT(countries_table, style="width:100%;"))
# --- Countries' Scientific Production Section ---
with ui.nav_panel("None", value="countries_scientific_production"):
@@ -3406,7 +3407,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
- result = get_countries_production(df)
+ result = get_countries_production(df.get())
return result
finally:
ui.modal_remove()
@@ -3422,7 +3423,7 @@ def show_countries_production():
@render.ui
def table_countries_production():
_, countries_table = countries_production()
- return ui.HTML(DT(countries_table, style="width=100%;"))
+ return ui.HTML(DT(countries_table, style="width:100%;"))
# --- Countries' Production over Time Section ---
with ui.nav_panel("None", value="countries_production_over_time"):
@@ -3531,7 +3532,7 @@ def loading_modal():
ui.modal_show(loading_modal())
try:
top_k_countries = input.TopCountriesProdK()
- result = get_countries_production_over_time(df, top_k_countries)
+ result = get_countries_production_over_time(df.get(), top_k_countries)
countries_over_time_results.set(result)
finally:
ui.modal_remove()
@@ -3566,7 +3567,7 @@ def table_countries_over_time():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, countries_table = result
- return ui.HTML(DT(countries_table, style="width=100%;"))
+ return ui.HTML(DT(countries_table, style="width:100%;"))
# --- Most Cited Countries Section ---
with ui.nav_panel("None", value="most_cited_countries"):
@@ -3677,7 +3678,7 @@ def loading_modal():
try:
num_of_cited_countries = input.num_of_cited_countries()
cited_countries_measure = input.cited_countries()
- result = get_cited_countries(df, num_of_cited_countries, cited_countries_measure)
+ result = get_cited_countries(df.get(), num_of_cited_countries, cited_countries_measure)
cited_countries_results.set(result)
finally:
ui.modal_remove()
@@ -3712,7 +3713,7 @@ def table_cited_countries():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, cited_countries_tab = result
- return ui.HTML(DT(cited_countries_tab, style="width=100%;"))
+ return ui.HTML(DT(cited_countries_tab, style="width:100%;"))
# --- Most Global Cited Documents Section ---
with ui.nav_panel("None", value="most_global_cited_documents"):
@@ -3817,7 +3818,7 @@ def loading_modal():
try:
num_of_cited_docs = input.num_of_cited_docs()
cited_docs = input.cited_docs()
- result = get_cited_documents(df, num_of_cited_docs, cited_docs)
+ result = get_cited_documents(df.get(), num_of_cited_docs, cited_docs)
cited_documents_results.set(result)
finally:
ui.modal_remove()
@@ -3852,7 +3853,7 @@ def table_cited_documents():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, cited_documents_tab = result
- return ui.HTML(DT(cited_documents_tab, style="width=100%;"))
+ return ui.HTML(DT(cited_documents_tab, style="width:100%;"))
# --- Most Local Cited Documents Section ---
with ui.nav_panel("None", value="most_local_cited_documents"):
@@ -3964,7 +3965,7 @@ def loading_modal():
# Run analysis
num_of_local_cited_docs = input.num_of_local_cited_docs()
field_separator = input.field_separator()
- result = get_local_cited_documents(df, num_of_local_cited_docs, field_separator)
+ result = get_local_cited_documents(df.get(), num_of_local_cited_docs, field_separator)
local_cited_documents_results.set(result)
finally:
ui.modal_remove()
@@ -3998,7 +3999,7 @@ def table_local_cited_documents():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, local_cited_documents_tab = result
- return ui.HTML(DT(local_cited_documents_tab, style="width=100%;"))
+ return ui.HTML(DT(local_cited_documents_tab, style="width:100%;"))
# --- Most Local Cited References Section ---
with ui.nav_panel("None", value="most_local_cited_references"):
@@ -4110,7 +4111,7 @@ def loading_modal():
# Run analysis
num_of_cited_refs = input.num_of_cited_refs()
field_separator_ref = input.field_separator_ref()
- result = get_local_cited_refs(df, num_of_cited_refs, field_separator_ref)
+ result = get_local_cited_refs(df.get(), num_of_cited_refs, field_separator_ref)
local_cited_refs_results.set(result)
finally:
ui.modal_remove()
@@ -4144,7 +4145,7 @@ def table_local_cited_refs():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, local_cited_refs_tab = result
- return ui.HTML(DT(local_cited_refs_tab, style="width=100%;"))
+ return ui.HTML(DT(local_cited_refs_tab, style="width:100%;"))
# --- References Spectroscopy Section ---
with ui.nav_panel("None", value="references_spectroscopy"):
@@ -4260,7 +4261,7 @@ def loading_modal():
start_year = input.start_year()
end_year = input.end_year()
field_separator_spec = input.field_separator_spec()
- result = get_references_spectroscopy(df, start_year, end_year, field_separator_spec)
+ result = get_references_spectroscopy(df.get(), start_year, end_year, field_separator_spec)
ref_spectroscopy_results.set(result)
finally:
ui.modal_remove()
@@ -4294,7 +4295,7 @@ def table_references_rpy():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, ref_rpy_tab, _ = result
- return ui.HTML(DT(ref_rpy_tab, style="width=100%;"))
+ return ui.HTML(DT(ref_rpy_tab, style="width:100%;"))
with ui.nav_panel("Table - Cited References"):
@render.ui
@@ -4306,7 +4307,7 @@ def table_references_spectroscopy():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, _, ref_spectroscopy_tab = result
- return ui.HTML(DT(ref_spectroscopy_tab, style="width=100%;"))
+ return ui.HTML(DT(ref_spectroscopy_tab, style="width:100%;"))
# --- Most Frequent Words ---
with ui.nav_panel("None", value="most_frequent_words"):
@@ -4470,7 +4471,7 @@ def loading_modal():
file_upload_synonyms_mfw = None
synonyms_data_mfw = None
- result = get_frequent_words(df, ngram_mfw, num_of_words_mfw, field_mfw, file_upload_terms_mfw, file_upload_synonyms_mfw)
+ result = get_frequent_words(df.get(), ngram_mfw, num_of_words_mfw, field_mfw, file_upload_terms_mfw, file_upload_synonyms_mfw)
frequent_words_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -4524,7 +4525,7 @@ def table_frequent_words():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, frequent_words_tab = result
- return ui.HTML(DT(frequent_words_tab, style="width=100%;"))
+ return ui.HTML(DT(frequent_words_tab, style="width:100%;"))
# --- WordCloud Section ---
with ui.nav_panel("None", value="wordcloud"):
@@ -4688,7 +4689,7 @@ def loading_modal():
file_upload_synonyms_wc = None
synonyms_data_wc = None
- result = get_wordcloud(df, ngram_wc, num_of_words_wc, field_wc, file_upload_terms_wc, file_upload_synonyms_wc)
+ result = get_wordcloud(df.get(), ngram_wc, num_of_words_wc, field_wc, file_upload_terms_wc, file_upload_synonyms_wc)
wordcloud_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -4742,7 +4743,7 @@ def table_wordcloud():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, wordcloud_tab = result
- return ui.HTML(DT(wordcloud_tab, style="width=100%;"))
+ return ui.HTML(DT(wordcloud_tab, style="width:100%;"))
# --- TreeMap Section ---
with ui.nav_panel("None", value="treemap"):
@@ -4906,7 +4907,7 @@ def loading_modal():
file_upload_synonyms_tm = None
synonyms_data_tm = None
- result = get_treemap(df, ngram_tm, num_of_words_tm, field_tm, file_upload_terms_tm, file_upload_synonyms_tm)
+ result = get_treemap(df.get(), ngram_tm, num_of_words_tm, field_tm, file_upload_terms_tm, file_upload_synonyms_tm)
treemap_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -4960,7 +4961,7 @@ def table_treemap():
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
)
_, treemap_tab = result
- return ui.HTML(DT(treemap_tab, style="width=100%;"))
+ return ui.HTML(DT(treemap_tab, style="width:100%;"))
- # --- References Spectroscopy Section ---
+ # --- Words' Frequency over Time Section ---
with ui.nav_panel("None", value="words_frequency_over_time"):
@@ -5127,7 +5128,7 @@ def loading_modal():
file_upload_synonyms_wf = None
synonyms_data_wf = None
- result = get_word_frequency(df, ngram_wf, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words)
+ result = get_word_frequency(df.get(), ngram_wf, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words)
word_frequency_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -5244,7 +5245,7 @@ def get_ngrams_tt():
@render.express()
def show_timespan():
data_temp = main_informations()
- ui.input_slider("time_window", "Timespan", sep="", ticks=True, min=data_temp['Min_Year'][0], max=data_temp['Max_Year'][0], value=[data_temp['Min_Year'][0], data_temp['Max_Year'][0]], step=1, time_format="YYYY")
+ ui.input_slider("time_window", "Timespan", sep="", ticks=True, min=data_temp["Min_Year"].iloc[0], max=data_temp["Max_Year"].iloc[0], value=[data_temp["Min_Year"].iloc[0], data_temp["Max_Year"].iloc[0]], step=1, time_format="YYYY")
with ui.accordion(id="acc_tt", multiple=True, open=False):
with ui.accordion_panel("Text Editing"):
@@ -5357,7 +5358,7 @@ def loading_modal():
word_mimimum_frequency = input.word_mimimum_frequency()
number_of_words_year = input.number_of_words_year()
- result = get_trend_topics(df, ngram_tt, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_mimimum_frequency, number_of_words_year)
+ result = get_trend_topics(df.get(), ngram_tt, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_mimimum_frequency, number_of_words_year)
trend_topics_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -5561,7 +5562,7 @@ def loading_modal():
community_repulsion = input.community_repulsion()
clustering_algorithm = input.clustering_algorithm()
- result = get_clustering_coupling(df, unit_of_analysis, coupling_field, stemmer, impact_measure, cluster_labeling, ngram, num_of_units, min_cluster_freq, label_per_cluster, label_size, community_repulsion, clustering_algorithm)
+ result = get_clustering_coupling(df.get(), unit_of_analysis, coupling_field, stemmer, impact_measure, cluster_labeling, ngram, num_of_units, min_cluster_freq, label_per_cluster, label_size, community_repulsion, clustering_algorithm)
clustering_coupling_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -5848,7 +5849,7 @@ def loading_modal():
modal_content.append(ui.markdown("""
Synonyms to Remove
"""))
modal_content.append(ui.HTML(DT(synonyms_data)))
- result = get_co_occurence_network(df, field_cn, ngram_cn, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes,
+ result = get_co_occurence_network(df.get(), field_cn, ngram_cn, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes,
repulsion_force, remove_isolated, min_edges, node_opacity, num_of_labels, node_shape, label_size_ls,
edge_size, node_shadow, edit_nodes, label_cex, file_upload_terms, file_upload_synonyms)
co_occurrence_network_results.set(result)
@@ -5895,7 +5896,7 @@ def table_co_occurrence_network():
result = co_occurrence_network_results.get()
if result is not None:
_, _, co_occurrence_network_tab, _ = result
- return ui.HTML(DT(co_occurrence_network_tab, style="width=100%;"))
+ return ui.HTML(DT(co_occurrence_network_tab, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run co-occurrence network", style="text-align: center; color: #999; font-size: 16px;"),
@@ -6068,7 +6069,7 @@ def loading_modal():
cluster = input.thematic_clustering()
repulsion = input.thematic_repulsion()
- result = get_thematic_map(df, field, n, minfreq, ngram, stemming,
+ result = get_thematic_map(df.get(), field, n, minfreq, ngram, stemming,
label_size, n_labels, repulsion, cluster)
thematic_map_results.set(result)
except Exception as e:
@@ -6116,7 +6117,7 @@ def table_thematic_map():
result = thematic_map_results.get()
if result is not None:
_, _, thematic_map_table, _, _ = result
- return ui.HTML(DT(thematic_map_table, style="width=100%;"))
+ return ui.HTML(DT(thematic_map_table, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"),
@@ -6129,7 +6130,7 @@ def clusters_thematic_map():
result = thematic_map_results.get()
if result is not None:
_, _, _, thematic_map_cluster, _ = result
- return ui.HTML(DT(thematic_map_cluster, style="width=100%;"))
+ return ui.HTML(DT(thematic_map_cluster, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"),
@@ -6142,7 +6143,7 @@ def documents_thematic_map():
result = thematic_map_results.get()
if result is not None:
_, _, _, _, thematic_map_documents = result
- return ui.HTML(DT(thematic_map_documents, maxBytes="10MB", style="width=100%;"))
+ return ui.HTML(DT(thematic_map_documents, maxBytes="10MB", style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"),
@@ -6403,7 +6404,7 @@ def loading_modal():
ngrams = input.thematic_evolution_ngram() if field in ["TI", "AB"] else 1
stemming = input.thematic_evolution_stemmer() if field in ["TI", "AB"] else False
- result = get_thematic_evolution(df, field, years, n, weight_index, min_weight_index, minfreq, label_size, ngrams, stemming, n_labels, overlap, remove_terms, synonyms, cluster)
+ result = get_thematic_evolution(df.get(), field, years, n, weight_index, min_weight_index, minfreq, label_size, ngrams, stemming, n_labels, overlap, remove_terms, synonyms, cluster)
thematic_evolution_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -6444,7 +6445,7 @@ def table_thematic_evolution():
result = thematic_evolution_results.get()
if result is not None:
_, thematic_evolution_table, _ = result
- return ui.HTML(DT(thematic_evolution_table, style="width=100%;"))
+ return ui.HTML(DT(thematic_evolution_table, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
@@ -6483,7 +6484,7 @@ def table_thematic_evolution_2():
if result is not None:
_, _, TM = result
if len(TM) > 0:
- return ui.HTML(DT(TM[0]["words"], style="width=100%;"))
+ return ui.HTML(DT(TM[0]["words"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6496,7 +6497,7 @@ def clusters_thematic_evolution_2():
if result is not None:
_, _, TM = result
if len(TM) > 0:
- return ui.HTML(DT(TM[0]["clusters"], style="width=100%;"))
+ return ui.HTML(DT(TM[0]["clusters"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6509,7 +6510,7 @@ def documents_thematic_evolution_2():
if result is not None:
_, _, TM = result
if len(TM) > 0:
- return ui.HTML(DT(TM[0]["documentToClusters"], maxBytes="10MB", style="width=100%;"))
+ return ui.HTML(DT(TM[0]["documentToClusters"], maxBytes="10MB", style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6547,7 +6548,7 @@ def table_thematic_evolution_3():
if result is not None:
_, _, TM = result
if len(TM) > 1:
- return ui.HTML(DT(TM[1]["words"], style="width=100%;"))
+ return ui.HTML(DT(TM[1]["words"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6560,7 +6561,7 @@ def clusters_thematic_evolution_3():
if result is not None:
_, _, TM = result
if len(TM) > 1:
- return ui.HTML(DT(TM[1]["clusters"], style="width=100%;"))
+ return ui.HTML(DT(TM[1]["clusters"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6573,7 +6574,7 @@ def documents_thematic_evolution_3():
if result is not None:
_, _, TM = result
if len(TM) > 1:
- return ui.HTML(DT(TM[1]["documentToClusters"], maxBytes="10MB", style="width=100%;"))
+ return ui.HTML(DT(TM[1]["documentToClusters"], maxBytes="10MB", style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6611,7 +6612,7 @@ def table_thematic_evolution_4():
if result is not None:
_, _, TM = result
if len(TM) > 2:
- return ui.HTML(DT(TM[2]["words"], style="width=100%;"))
+ return ui.HTML(DT(TM[2]["words"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6624,7 +6625,7 @@ def clusters_thematic_evolution_4():
if result is not None:
_, _, TM = result
if len(TM) > 2:
- return ui.HTML(DT(TM[2]["clusters"], style="width=100%;"))
+ return ui.HTML(DT(TM[2]["clusters"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6637,7 +6638,7 @@ def documents_thematic_evolution_4():
if result is not None:
_, _, TM = result
if len(TM) > 2:
- return ui.HTML(DT(TM[2]["documentToClusters"], maxBytes="10MB", style="width=100%;"))
+ return ui.HTML(DT(TM[2]["documentToClusters"], maxBytes="10MB", style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6675,7 +6676,7 @@ def table_thematic_evolution_5():
if result is not None:
_, _, TM = result
if len(TM) > 3:
- return ui.HTML(DT(TM[3]["words"], style="width=100%;"))
+ return ui.HTML(DT(TM[3]["words"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6688,7 +6689,7 @@ def clusters_thematic_evolution_5():
if result is not None:
_, _, TM = result
if len(TM) > 3:
- return ui.HTML(DT(TM[3]["clusters"], style="width=100%;"))
+ return ui.HTML(DT(TM[3]["clusters"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6701,7 +6702,7 @@ def documents_thematic_evolution_5():
if result is not None:
_, _, TM = result
if len(TM) > 3:
- return ui.HTML(DT(TM[3]["documentToClusters"], maxBytes="10MB", style="width=100%;"))
+ return ui.HTML(DT(TM[3]["documentToClusters"], maxBytes="10MB", style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6739,7 +6740,7 @@ def table_thematic_evolution_6():
if result is not None:
_, _, TM = result
if len(TM) > 4:
- return ui.HTML(DT(TM[4]["words"]), style="width=100%;")
+ return ui.HTML(DT(TM[4]["words"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6752,7 +6753,7 @@ def clusters_thematic_evolution_6():
if result is not None:
_, _, TM = result
if len(TM) > 4:
- return ui.HTML(DT(TM[4]["clusters"], style="width=100%;"))
+ return ui.HTML(DT(TM[4]["clusters"], style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6765,7 +6766,7 @@ def documents_thematic_evolution_6():
if result is not None:
_, _, TM = result
if len(TM) > 4:
- return ui.HTML(DT(TM[4]["documentToClusters"], maxBytes="10MB", style="width=100%;"))
+ return ui.HTML(DT(TM[4]["documentToClusters"], maxBytes="10MB", style="width:100%;"))
return ui.div(
ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"),
style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;"
@@ -6995,7 +6996,7 @@ def loading_modal():
labelsize=input.wordmap_labelsize()
size=input.wordmap_dot_size()
- result = get_factorial_analysis(df, ngram, field, terms_data_wm, synonyms_data_wm, n_terms, n_clusters, num_documents, method, dimX, dimY, topWordPlot, threshold, labelsize, size)
+ result = get_factorial_analysis(df.get(), ngram, field, terms_data_wm, synonyms_data_wm, n_terms, n_clusters, num_documents, method, dimX, dimY, topWordPlot, threshold, labelsize, size)
factorial_analysis_results.set(result)
except Exception as e:
ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10)
@@ -7051,7 +7052,7 @@ def show_words_by_cluster():
result = factorial_analysis_results.get()
if result is not None:
_, _, words_by_cluster, _ = result
- return ui.HTML(DT(words_by_cluster, style="width=100%;"))
+ return ui.HTML(DT(words_by_cluster, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run factorial analysis", style="text-align: center; color: #999; font-size: 16px;"),
@@ -7064,7 +7065,7 @@ def show_articles_by_cluster():
result = factorial_analysis_results.get()
if result is not None:
_, _, _, articles_by_cluster = result
- return ui.HTML(DT(articles_by_cluster, style="width=100%;"))
+ return ui.HTML(DT(articles_by_cluster, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to run factorial analysis", style="text-align: center; color: #999; font-size: 16px;"),
@@ -7186,7 +7187,7 @@ def loading_modal():
# Execute analysis
result = get_co_citation(
- df=df,
+ df=df.get(),
field=field,
sep=sep,
cocit_network_layout=cocit_network_layout,
@@ -7345,7 +7346,7 @@ def show_cocitation_table():
result = co_citation_network_results.get()
if result is not None:
_, _, cocit_table, _ = result
- return ui.HTML(DT(cocit_table, style="width=100%;"))
+ return ui.HTML(DT(cocit_table, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to generate the co-citation table.", style="text-align: center; color: #666; font-size: 16px;"),
@@ -7474,7 +7475,7 @@ def loading_modal():
histsize = input.histsize()
# Execute analysis with correct parameters
result = get_historiograph(
- df=df,
+ df=df.get(),
node_label="AU1",
histNodes=histNodes,
hist_isolates=True,
@@ -7560,7 +7561,7 @@ def show_hist_table():
result = historiograph_results.get()
if result is not None:
_, hist_tab, _ = result
- return ui.HTML(DT(hist_tab, style="width=100%;"))
+ return ui.HTML(DT(hist_tab, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to generate the historiograph table.", style="text-align: center; color: #666; font-size: 16px;"),
@@ -7690,7 +7691,7 @@ def loading_modal():
# Execute analysis
result = get_collaboration_network(
- df=df,
+ df=df.get(),
field=field,
network_layout=network_layout,
clustering_algorithm=clustering_algorithm,
@@ -7865,7 +7866,7 @@ def show_collaboration_table():
result = collaboration_network_results.get()
if result is not None:
_, _, collab_table, _ = result
- return ui.HTML(DT(collab_table, style="width=100%;"))
+ return ui.HTML(DT(collab_table, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to generate the collaboration table.", style="text-align: center; color: #666; font-size: 16px;"),
@@ -7987,7 +7988,7 @@ def loading_modal():
try:
# Execute analysis (with default parameters for world map collaboration)
result = get_world_map_collaboration(
- df=df,
+ df=df.get(),
edges_min=1,
edgesize=5
)
@@ -8045,7 +8046,7 @@ def show_world_map_collaboration_table():
result = countries_collaboration_network_results.get()
if result is not None:
_, world_map_table = result
- return ui.HTML(DT(world_map_table, style="width=100%;"))
+ return ui.HTML(DT(world_map_table, style="width:100%;"))
else:
return ui.div(
ui.p("Click the Run Analysis button to generate the world map collaboration table.", style="text-align: center; color: #666; font-size: 16px;"),
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 000000000..bc39756b7
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,8 @@
+extraction:
+ query: "bibliometrics"
+ max_results: 50
+ source: "OPENALEX"
+
+paths:
+ output_csv: "standardized_output.csv"
+ log_file: "pipeline.log"
\ No newline at end of file
diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py
index e1b87f583..310ee2fe7 100644
--- a/functions/get_affiliationproductionovertime.py
+++ b/functions/get_affiliationproductionovertime.py
@@ -12,9 +12,10 @@ def get_affiliation_production_over_time(df, top_k_affiliations):
Returns:
A Plotly figure object representing the affiliation's production over time.
"""
- data = df.get()
+ data = df
- AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""])
+ AFF_series = data["AU_UN"].fillna("").apply(lambda x: [aff for aff in (x if isinstance(x, list) else str(x).split(";")) if str(aff).strip() not in ["", "nan"]])
+ AFF = AFF_series
nAFF = [len(aff) for aff in AFF]
affiliations = [aff for sublist in AFF for aff in sublist]
diff --git a/functions/get_annualproduction.py b/functions/get_annualproduction.py
index dd27105c2..99166bb32 100644
--- a/functions/get_annualproduction.py
+++ b/functions/get_annualproduction.py
@@ -11,7 +11,7 @@ def get_annual_production(df):
Returns:
A Plotly figure object representing the annual scientific production.
"""
- data = df.get()
+ data = df
# Calculate the number of publications per year
publications_per_year = data["PY"].value_counts().sort_index().reset_index()
diff --git a/functions/get_authorlocalimpact.py b/functions/get_authorlocalimpact.py
index 74a68e263..bf9a88c21 100644
--- a/functions/get_authorlocalimpact.py
+++ b/functions/get_authorlocalimpact.py
@@ -13,7 +13,7 @@ def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impac
Returns:
A Plotly figure object and a DataFrame of the most impactful sources.
"""
- df = df.get()
+ df = df
today = pd.Timestamp.now().year
# Ensure 'TC' and 'PY' are numeric
diff --git a/functions/get_authorproductionovertime.py b/functions/get_authorproductionovertime.py
index 65edaca96..ba1bf0a4d 100644
--- a/functions/get_authorproductionovertime.py
+++ b/functions/get_authorproductionovertime.py
@@ -16,7 +16,7 @@ def get_author_production_over_time(df, top_k_authors):
table_authors_production (pd.DataFrame): Table summarizing authors' production with TC and TCpY.
table_documents (pd.DataFrame): Detailed table with additional document information.
"""
- data = df.get()
+ data = df
# Ensure "PY" is numeric
data["PY"] = pd.to_numeric(data["PY"], errors="coerce")
diff --git a/functions/get_averagecitations.py b/functions/get_averagecitations.py
index d752aa9b7..638a14849 100644
--- a/functions/get_averagecitations.py
+++ b/functions/get_averagecitations.py
@@ -11,7 +11,7 @@ def get_average_citations(df):
Returns:
A Plotly figure object representing the average citations per year.
"""
- data = df.get()
+ data = df
# Calculate the current year
current_year = pd.Timestamp.now().year + 1
diff --git a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py
index 86580591f..5ff8a0fec 100644
--- a/functions/get_bradfordlaw.py
+++ b/functions/get_bradfordlaw.py
@@ -12,7 +12,7 @@ def get_bradford_law(df):
A Plotly figure object and a DataFrame of the Bradford's Law zones.
"""
# Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE))
- data = df.get()
+ data = df
source_counts = data["SO"].value_counts()
# Total number of sources
@@ -67,7 +67,7 @@ def get_bradford_law(df):
fig.add_shape(
type="rect",
x0=0,
- x1=np.log(df_bradford["Rank"][a]),
+ x1=np.log(df_bradford["Rank"].iloc[int(a)-1]),
y0=0,
y1=df_bradford["Freq"].max(),
fillcolor="#B3D1F2",
@@ -78,7 +78,7 @@ def get_bradford_law(df):
# Add the "Core Sources" annotation with smaller font
fig.add_annotation(
- x=np.log(df_bradford["Rank"][a]) / 2,
+ x=np.log(df_bradford["Rank"].iloc[int(a)-1]) / 2,
y=df_bradford["Freq"].max() * 0.85,
text="Core
Sources",
showarrow=False,
diff --git a/functions/get_bradfordlaw.py.bak b/functions/get_bradfordlaw.py.bak
new file mode 100644
index 000000000..569aaa4d3
--- /dev/null
+++ b/functions/get_bradfordlaw.py.bak
@@ -0,0 +1,126 @@
+from www.services import *
+
+
+def get_bradford_law(df):
+ """
+ Generate a plot and table based on Bradford's Law.
+
+ Args:
+ df: A DataFrame object containing the data.
+
+ Returns:
+ A Plotly figure object and a DataFrame of the Bradford's Law zones.
+ """
+ # Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE))
+ data = df
+ source_counts = data["SO"].value_counts()
+
+ # Total number of sources
+ n = source_counts.sum()
+ # Cumulative sum of the frequencies (equivalent to cumsum in R)
+ cumSO = source_counts.cumsum()
+
+ # Define the cut points for Bradford's Law (zones)
+ cutpoints = [1, n * 0.33, n * 0.67, float('inf')]
+ groups = pd.cut(cumSO, bins=cutpoints, labels=["Zone 1", "Zone 2", "Zone 3"])
+
+ # Find the cut points for "Core" sources
+ a = (cumSO < n * 0.33).sum() + 1
+ b = (cumSO < n * 0.67).sum() + 1
+ Z = ["Zone 1"] * a + ["Zone 2"] * (b - a) + ["Zone 3"] * (len(cumSO) - b)
+
+ # Create a DataFrame for Bradford's Law table
+ df_bradford = pd.DataFrame({
+ "SO": cumSO.index.str[:25], # Shorten the source names to 25 characters if necessary
+ "Rank": range(1, len(cumSO) + 1),
+ "Freq": source_counts.values,
+ "cumFreq": cumSO.values,
+ "Zone": Z
+ })
+
+ # Create the Plotly figure
+ fig = go.Figure()
+
+ # Add the line plot without text above the points
+ fig.add_trace(go.Scatter(
+ x=np.log(df_bradford["Rank"]),
+ y=df_bradford["Freq"],
+ mode='lines+markers',
+ name='Articles per Source',
+ marker=dict(
+ color='#5567BB',
+ size=10,
+ line=dict(width=1, color='white'),
+ opacity=0.95
+ ),
+ line=dict(color='#5567BB', width=2, shape='spline'),
+ hovertemplate=(
+ "Source: %{customdata[0]}
"
+ "Rank: %{x:.2f}
"
+ "N. of Documents: %{y}
"
+ "Zone: %{customdata[1]}"
+ ),
+ customdata=np.stack([df_bradford["SO"], df_bradford["Zone"]], axis=-1)
+ ))
+
+ # Add the "Core Sources" area with the rectangle
+ fig.add_shape(
+ type="rect",
+ x0=0,
+ x1=np.log(df_bradford["Rank"][a]),
+ y0=0,
+ y1=df_bradford["Freq"].max(),
+ fillcolor="#B3D1F2",
+ opacity=0.18,
+ line_width=0,
+ layer="below"
+ )
+
+ # Add the "Core Sources" annotation with smaller font
+ fig.add_annotation(
+ x=np.log(df_bradford["Rank"][a]) / 2,
+ y=df_bradford["Freq"].max() * 0.85,
+ text="Core
Sources",
+ showarrow=False,
+ font=dict(size=15, color="#5567BB", family="Segoe UI, Arial"),
+ align="center",
+ bgcolor="rgba(255,255,255,0.7)",
+ bordercolor="#B3D1F2",
+ borderpad=4,
+ borderwidth=1,
+ )
+
+ # Customize the X axis labels (log scale) with smaller font
+ fig.update_layout(
+ xaxis=dict(
+ title="Source log(Rank)",
+ tickmode='array',
+ tickvals=np.log(df_bradford["Rank"][:a]),
+ ticktext=df_bradford["SO"][:a],
+ tickangle=90,
+ showgrid=True,
+ gridcolor="#F0F0F0",
+ zeroline=False,
+ tickfont=dict(size=10),
+ ),
+ yaxis=dict(
+ title="N. of Documents",
+ showgrid=True,
+ gridcolor="#F0F0F0",
+ zeroline=False,
+ tickfont=dict(size=10),
+ ),
+ plot_bgcolor='white',
+ font=dict(color="#222222", size=11, family="Segoe UI, Arial"),
+ margin=dict(l=80, r=40, t=40, b=120),
+ height=800,
+ showlegend=False,
+ hoverlabel=dict(
+ bgcolor="white",
+ font_size=11,
+ font_family="Segoe UI, Arial",
+ bordercolor="#5567BB"
+ ),
+ )
+
+ return fig, df_bradford
diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py
index ac95a8d0c..07d9d65a2 100644
--- a/functions/get_citedcountries.py
+++ b/functions/get_citedcountries.py
@@ -15,7 +15,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure):
"""
# Extract metadata tags for cited countries
df = metaTagExtraction(df, "AU1_CO")
- df = df.get()
+ df = df
# Prepare the table for ranking countries
tab = (
@@ -100,6 +100,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure):
# Set x-axis ticks
max_x = x_values.max()
+ max_x = 0 if pd.isna(max_x) else max_x
tick_step = 5 if max_x <= 50 else int(max_x // 10) or 1
x_ticks = list(range(0, int(max_x) + tick_step, tick_step))
if x_ticks[-1] < max_x:
diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py
index 14491f74a..badf146ae 100644
--- a/functions/get_citeddocuments.py
+++ b/functions/get_citeddocuments.py
@@ -14,8 +14,8 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure):
A Plotly figure object and a DataFrame of the most cited documents.
"""
# Extract metadata tags for cited documents
- df = metaTagExtraction(df, "SR")
- df = df.get()
+ if "SR" not in df.columns or (df["SR"] == "").all():
+ df = metaTagExtraction(df, "SR")
# Prepare the table for ranking documents
current_year = pd.to_datetime("today").year
diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py
index ec96b143a..51ff616d0 100644
--- a/functions/get_co_occurence_network.py
+++ b/functions/get_co_occurence_network.py
@@ -136,7 +136,7 @@ def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_alg
# Generate layout
# Using default igraph layout
- layout = cocnet['graph']['layout']
+ layout = cocnet['layout']
print("Layout:", layout)
# Get coordinates from layout
coords = np.array([[pos[0], pos[1]] for pos in layout])
@@ -479,7 +479,7 @@ def field_by_year(df, field_cn, timespan=None, min_freq=2, n_items=5, remove_ter
The field to analyze ('ID', 'DE', 'TI', 'AB', 'WC')
"""
# Get the field data
- M = df.get()
+ M = df
# Create co-occurrence matrix
A = cocMatrix(df, field_cn, binary=False, remove_terms=remove_terms, synonyms=synonyms)
diff --git a/functions/get_cocitation.py b/functions/get_cocitation.py
index 8bad105c0..a90f628a9 100644
--- a/functions/get_cocitation.py
+++ b/functions/get_cocitation.py
@@ -95,7 +95,7 @@ def get_co_citation(
b = np.random.randint(0, 255)
cluster_colors[cluster_id] = f"rgba({r},{g},{b},0.7)"
- layout = cocitnet['graph']['layout']
+ layout = cocitnet['layout']
coords = np.array([[pos[0], pos[1]] for pos in layout])
coords = coords / np.abs(coords).max()
coords[:, 0] *= 1000
diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py
index 512ed7489..88213b9c5 100644
--- a/functions/get_collaborationnetwork.py
+++ b/functions/get_collaborationnetwork.py
@@ -46,7 +46,7 @@ def get_collaboration_network(
print("Generating collaboration network...")
M = df
- m = df.get()
+ m = df
NetRefs = None
Title = ""
@@ -108,7 +108,7 @@ def get_collaboration_network(
b = np.random.randint(0, 255)
cluster_colors[cluster_id] = f"rgba({r},{g},{b},{opacity})"
- layout = netplot['graph']['layout']
+ layout = netplot['layout']
coords = np.array([[pos[0], pos[1]] for pos in layout])
coords = coords / np.abs(coords).max()
coords[:, 0] *= 1000
diff --git a/functions/get_correspondingauthorcountries.py b/functions/get_correspondingauthorcountries.py
index 5ba9832b2..f51a0004f 100644
--- a/functions/get_correspondingauthorcountries.py
+++ b/functions/get_correspondingauthorcountries.py
@@ -13,9 +13,11 @@ def get_corresponding_author_countries(df, top_k_countries):
A Plotly figure object and a DataFrame of the most common corresponding author countries.
"""
# Estrai i metadati "AU_CO" e "AU1_CO" e verifica il tipo di dati
- df = metaTagExtraction(df, Field="AU_CO") # Assumendo che `metaTagExtraction` sia già definita
- df = metaTagExtraction(df, Field="AU1_CO")
- data = df.get() # Se `df` è un oggetto reattivo
+ if "AU_CO" not in df.columns or df["AU_CO"].apply(lambda x: isinstance(x, list) and len(x) == 0).all():
+ df = metaTagExtraction(df, Field="AU_CO")
+ if "AU1_CO" not in df.columns or (df["AU1_CO"] == "").all():
+ df = metaTagExtraction(df, Field="AU1_CO")
+ data = df # `df` is expected to be a plain DataFrame here (no reactive `.get()` unwrapping) - confirm against caller
# Assicurati che le colonne siano di tipo stringa e rimuovi righe con valori mancanti
data = data.dropna(subset=["AU1_CO", "AU_CO"])
diff --git a/functions/get_countriesproduction.py b/functions/get_countriesproduction.py
index 81c0e0c34..af8f474d0 100644
--- a/functions/get_countriesproduction.py
+++ b/functions/get_countriesproduction.py
@@ -13,7 +13,7 @@ def get_countries_production(df):
"""
# Assicurati che i metadati siano stati estratti
df = metaTagExtraction(df, "AU_CO")
- df = df.get()
+ df = df
# Conta le occorrenze dei paesi
df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x])
diff --git a/functions/get_countriesproductionovertime.py b/functions/get_countriesproductionovertime.py
index aede25bbd..8039e12c4 100644
--- a/functions/get_countriesproductionovertime.py
+++ b/functions/get_countriesproductionovertime.py
@@ -13,7 +13,7 @@ def get_countries_production_over_time(df, top_k_countries):
A Plotly figure object representing the country's production over time.
"""
df = metaTagExtraction(df, "AU_CO")
- data = df.get()
+ data = df
AFF = pd.Series(data["AU_CO"]).dropna().apply(lambda x: [aff.strip() for aff in x if aff.strip() != ""])
nAFF = [len(aff) for aff in AFF]
diff --git a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py
index 3324bcfb6..1b08b177e 100644
--- a/functions/get_factorialanalysis.py
+++ b/functions/get_factorialanalysis.py
@@ -74,7 +74,7 @@ def get_factorial_analysis(
# Set ngrams based on word_type
ngrams = int(ngram) if field in ['TI', 'AB'] else 1
- M = df.get()
+ M = df
tab = table_tag(M, field, ngrams)
if len(tab) >= 2:
@@ -136,8 +136,8 @@ def get_factorial_analysis(
# Verifica che eigCorr esista prima di accedere
if CS["res"] is not None and hasattr(CS["res"], "eigCorr"):
- xlabel = f"Dim 1 ({CS['res'].eigCorr['perc'][dimX]:.2f}%)"
- ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'][dimY]:.2f}%)"
+ xlabel = f"Dim 1 ({CS['res'].eigCorr['perc'].iloc[dimX-1] if len(CS['res'].eigCorr['perc']) > dimX-1 else 0:.2f}%)"
+ ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'].iloc[dimY-1] if len(CS['res'].eigCorr['perc']) > dimY-1 else 0:.2f}%)"
else:
xlabel, ylabel = "Dim 1", "Dim 2"
@@ -157,7 +157,8 @@ def get_factorial_analysis(
wordCoord["dotSize"] = wordCoord["dotSize"].replace([np.inf, -np.inf], np.nan)
wordCoord["dotSize"] = wordCoord["dotSize"].fillna(1)
wordCoord["dotSize"] = wordCoord["dotSize"].clip(lower=1)
- thres = sorted(wordCoord["dotSize"], reverse=True)[min(int(topWordPlot), len(wordCoord) - 1)]
+ topWordPlot_safe = min(int(topWordPlot) if np.isfinite(topWordPlot) else len(wordCoord), len(wordCoord) - 1)
+ thres = sorted(wordCoord["dotSize"], reverse=True)[topWordPlot_safe]
wordCoord["labelToPlot"] = np.where(wordCoord["dotSize"] >= thres, wordCoord["label"], "")
# Avoid label overlapping
@@ -950,7 +951,7 @@ def factorial(X, method, n_clusters=5, k_max=5):
# Crea la lista `coord`
coord_df = pd.DataFrame({
"Dim1": cpc[:, 0],
- "Dim2": cpc[:, 1],
+ "Dim2": cpc[:, 1] if cpc.shape[1] > 1 else np.zeros(len(cpc)),
"label": levelnames
})
mask = coord_df["label"].str[-2:] == "_1"
diff --git a/functions/get_factorialanalysis.py.bak b/functions/get_factorialanalysis.py.bak
new file mode 100644
index 000000000..4d299dcf8
--- /dev/null
+++ b/functions/get_factorialanalysis.py.bak
@@ -0,0 +1,1180 @@
+from www.services import *
+from scipy.spatial import ConvexHull, QhullError
+
+def distance_to_y(dist, max_dist, scale_factor):
+ norm = math.log1p(dist) / math.log1p(max_dist)
+ return -norm * scale_factor
+
+def get_leaf_clusters(node, label_to_new_index, labels_lower, node_to_cluster):
+ if node.is_leaf():
+ label = labels_lower[node.id]
+ return {node_to_cluster[label_to_new_index[label]]}
+ left_clusters = get_leaf_clusters(node.left, label_to_new_index, labels_lower, node_to_cluster)
+ right_clusters = get_leaf_clusters(node.right, label_to_new_index, labels_lower, node_to_cluster)
+ return left_clusters.union(right_clusters)
+
+def _to_seq(val) -> List[str]:
+ """Flatten *val* to a list of strings, dropping NaN/None."""
+ if val is None or (isinstance(val, float) and pd.isna(val)):
+ return []
+ if isinstance(val, (list, tuple, set, np.ndarray)):
+ seq: Sequence = val # type: ignore
+ else:
+ seq = [val]
+ out: List[str] = []
+ for x in seq:
+ if x is None or (isinstance(x, float) and pd.isna(x)):
+ continue
+ out.append(str(x))
+ return out
+
+def assign_consistent_colors(clusters):
+ palette = px.colors.qualitative.Plotly
+ unique_clusters = sorted(set(clusters.dropna()))
+ color_map = {cluster: palette[i % len(palette)] for i, cluster in enumerate(unique_clusters)}
+ color_map[np.nan] = "#CCCCCC" # fallback per cluster NaN
+ return color_map
+
+
+def get_factorial_analysis(
+ df: pd.DataFrame,
+ ngram: Union[int, str] = 1,
+ field: str = "ID",
+ terms_data_wm: Optional[Sequence[str]] = None,
+ synonyms_data_wm: Optional[Dict[str, str]] = None,
+ n_terms: int = 50,
+ n_clusters: int = 5,
+ num_documents: Optional[int] = None,
+ method: str = "MCA",
+ dimX: int = 1,
+ dimY: int = 2,
+ topWordPlot: Union[int, float] = np.inf,
+ threshold: float = 0.10,
+ labelsize: int = 16,
+ size: int = 5,
+):
+ """Generate a 2‑D interactive *word map* for bibliometric data."""
+ # Load terms to remove
+ remove_term = None
+ if terms_data_wm:
+ with open(terms_data_wm[0]['datapath'], 'r', encoding='utf-8') as file:
+ remove_term = [line.strip() for line in file]
+
+ # Load synonyms
+ synonym = None
+ if synonyms_data_wm:
+ with open(synonyms_data_wm[0]['datapath'], 'r', encoding='utf-8') as file:
+ synonym = {}
+ for line in file:
+ terms = [term.strip() for term in line.split(',')]
+ key = terms[0]
+ values = terms[1:]
+ synonym[key] = values
+
+ # Set ngrams based on word_type
+ ngrams = int(ngram) if field in ['TI', 'AB'] else 1
+
+ M = df
+ tab = table_tag(M, field, ngrams)
+
+ if len(tab) >= 2:
+ # Get minimum degree threshold from the nth term
+ min_degree = list(tab.values())[min(n_terms, len(tab)-1)]
+
+ CS = conceptual_structure(
+ df=df,
+ method=method,
+ field=field,
+ min_degree=min_degree,
+ n_clusters=n_clusters,
+ k_max=8,
+ stemming=False,
+ labelsize=int(labelsize/2),
+ documents=num_documents,
+ graph=False,
+ ngrams=ngrams,
+ remove_terms=remove_term,
+ synonyms=synonym
+ )
+
+ if method != "MDS":
+ CSData = CS["docCoord"].copy()
+ CSData = CSData.reset_index().rename(columns={"index": "Documents"})
+ CSData["dim1"] = CSData["dim1"].round(2)
+ CSData["dim2"] = CSData["dim2"].round(2)
+ CSData["contrib"] = CSData["contrib"].round(2)
+ CS["CSData"] = CSData
+ else:
+ CS["CSData"] = pd.DataFrame({"Documents": [None], "dim1": [None], "dim2": [None]})
+
+ if method in {"CA", "MCA"}:
+ WData = pd.DataFrame(CS["km_res"]["data"], columns=["Dim1", "Dim2"])
+ WData["word"] = CS["km_res"]["data"].index
+ WData["cluster"] = CS["km_res"]["data"]["cluster"]
+ elif method == "MDS":
+ WData = pd.DataFrame(CS["res"], columns=["Dim1", "Dim2"])
+ WData["word"] = CS["res"].index
+ WData["cluster"] = CS["km_res"]["cluster"]
+
+ WData = WData.round({"Dim1": 2, "Dim2": 2})
+ CS["WData"] = WData
+
+ LABEL = WData["word"]
+
+ if method in {"CA", "MCA"}:
+ WData = CS["km_res"]["data"].copy()
+ WData = WData.reset_index().rename(columns={"index": "word"})
+ if "cluster" not in WData.columns and "cluster" in CS["km_res"]:
+ WData["cluster"] = CS["km_res"]["cluster"]
+ elif "cluster" not in WData.columns:
+ WData["cluster"] = np.nan
+ wordCoord = WData[["Dim1", "Dim2", "word", "cluster"]].copy()
+ wordCoord.rename(columns={"word": "label", "cluster": "groups"}, inplace=True)
+ contrib = CS["coord"]["contrib"].sum(axis=1) / 2
+ wordCoord["label"] = wordCoord["label"].values
+ wordCoord["contrib"] = np.array(contrib).flatten()
+
+ # Verifica che eigCorr esista prima di accedere
+ if CS["res"] is not None and hasattr(CS["res"], "eigCorr"):
+ xlabel = f"Dim 1 ({CS['res'].eigCorr['perc'].iloc[dimX-1] if len(CS['res'].eigCorr['perc']) > dimX-1 else 0:.2f}%)"
+ ylabel = f"Dim 2 ({CS['res'].eigCorr['perc'].iloc[dimY-1] if len(CS['res'].eigCorr['perc']) > dimY-1 else 0:.2f}%)"
+ else:
+ xlabel, ylabel = "Dim 1", "Dim 2"
+
+ elif method == "MDS":
+ wordCoord = WData[["Dim1", "Dim2", "word", "cluster"]].copy()
+ wordCoord.rename(columns={"word": "label", "cluster": "groups"}, inplace=True)
+ wordCoord.rename(columns={"word": "label", "cluster": "groups"}, inplace=True)
+ wordCoord["contrib"] = size / 2 # MDS non ha contribuzioni vere
+ xlabel, ylabel = "Dim 1", "Dim 2"
+
+
+ ymax = wordCoord["Dim2"].max() - wordCoord["Dim2"].min()
+ xmax = wordCoord["Dim1"].max() - wordCoord["Dim1"].min()
+ threshold2 = threshold * np.mean([xmax, ymax])
+
+ wordCoord["dotSize"] = wordCoord["contrib"] + size
+ wordCoord["dotSize"] = wordCoord["dotSize"].replace([np.inf, -np.inf], np.nan)
+ wordCoord["dotSize"] = wordCoord["dotSize"].fillna(1)
+ wordCoord["dotSize"] = wordCoord["dotSize"].clip(lower=1)
+ topWordPlot_safe = min(int(topWordPlot) if np.isfinite(topWordPlot) else len(wordCoord), len(wordCoord) - 1)
+ thres = sorted(wordCoord["dotSize"], reverse=True)[topWordPlot_safe]
+ wordCoord["labelToPlot"] = np.where(wordCoord["dotSize"] >= thres, wordCoord["label"], "")
+
+ # Avoid label overlapping
+ # Placeholder for avoidOverlaps logic
+ # labelToRemove = avoidOverlaps(wordCoord, threshold=threshold2, dimX=dimX, dimY=dimY)
+ # wordCoord["labelToPlot"] = np.where(wordCoord["labelToPlot"].isin(labelToRemove), "", wordCoord["labelToPlot"])
+ # wordCoord["label"] = wordCoord["label"].str.replace("_1", "", regex=False)
+ # wordCoord["labelToPlot"] = wordCoord["labelToPlot"].str.replace("_1", "", regex=False)
+
+
+ ####################################### WORD MAP #######################################
+ # Palette cluster
+ group_colors = assign_consistent_colors(wordCoord["groups"])
+
+ # Hover arricchito
+ hoverText = [
+ f"{row['label']}
Cluster: {row['groups'] if 'groups' in row else ''}
Contrib: {row['contrib']:.3f}"
+ for _, row in wordCoord.iterrows()
+ ]
+
+ fig = go.Figure()
+
+ # Marker colorati per cluster, trasparenti, bordo sottile
+ for g in sorted(wordCoord["groups"].dropna().unique()):
+ group_df = wordCoord[wordCoord["groups"] == g]
+ fig.add_trace(
+ go.Scatter(
+ x=group_df["Dim1"],
+ y=group_df["Dim2"],
+ mode="markers",
+ marker=dict(
+ size=group_df["dotSize"],
+ color=group_colors.get(g, "#FF0000"), # fallback colore
+ opacity=0.7,
+ line=dict(width=0.7, color="black"),
+ symbol="circle",
+ ),
+ opacity=0.7,
+ text=group_df["label"],
+ hovertext=[
+ f"{row['label']}
Cluster: {row['groups']}
Contrib: {row['contrib']:.3f}"
+ for _, row in group_df.iterrows()
+ ],
+ hoverinfo="text",
+ name=f"Cluster {g}",
+ showlegend=False,
+ )
+ )
+
+ # Aggiungi i NaN separatamente (se esistono)
+ group_df_nan = wordCoord[wordCoord["groups"].isna()]
+ if not group_df_nan.empty:
+ fig.add_trace(
+ go.Scatter(
+ x=group_df_nan["Dim1"],
+ y=group_df_nan["Dim2"],
+ mode="markers",
+ marker=dict(
+ size=group_df_nan["dotSize"],
+ color="#FF9999",
+ opacity=0.7,
+ line=dict(width=0.7, color="black"),
+ symbol="circle",
+ ),
+ opacity=0.7,
+ text=group_df_nan["label"],
+ hovertext=[
+ f"{row['label']}
Cluster: N/A
Contrib: {row['contrib']:.3f}"
+ for _, row in group_df_nan.iterrows()
+ ],
+ hoverinfo="text",
+ name="No Cluster",
+ showlegend=False,
+ )
+ )
+
+ # Aggiungi contorni dei cluster (Convex Hull)
+ if n_clusters != 1 and "hull_data" in CS and CS["hull_data"] is not None and not CS["hull_data"].empty:
+ hull_data = CS["hull_data"]
+ for cluster_id in hull_data["cluster"].unique():
+ group = hull_data[hull_data["cluster"] == cluster_id]
+ fig.add_trace(
+ go.Scatter(
+ x=group["Dim1"],
+ y=group["Dim2"],
+ mode="lines",
+ line=dict(color=group_colors.get(cluster_id, "gray"), width=2),
+ fill="toself",
+ opacity=0.15,
+ hoverinfo="skip",
+ showlegend=False
+ )
+ )
+
+ # Etichette solo per i top word (labelToPlot), spostate più in alto rispetto ai pallini
+ # Offset dinamico in base alla dimensione verticale del grafico
+ label_offset = 0.03 * (wordCoord["Dim2"].max() - wordCoord["Dim2"].min())
+
+ for _, row in wordCoord[wordCoord["labelToPlot"] != ""].iterrows():
+ fig.add_annotation(
+ x=row["Dim1"],
+ y=row["Dim2"] + label_offset,
+ text=row["labelToPlot"],
+ font=dict(size=labelsize, color=group_colors.get(row["groups"], "black")),
+ showarrow=False,
+ )
+
+ # Assi X=0 e Y=0, grigi e tratteggiati
+ fig.add_shape(
+ type="line",
+ x0=wordCoord["Dim1"].min(),
+ x1=wordCoord["Dim1"].max(),
+ y0=0,
+ y1=0,
+ line=dict(color="#B0B0B0", width=1.5, dash="dash"),
+ layer="below"
+ )
+ fig.add_shape(
+ type="line",
+ x0=0,
+ x1=0,
+ y0=wordCoord["Dim2"].min(),
+ y1=wordCoord["Dim2"].max(),
+ line=dict(color="#B0B0B0", width=1.5, dash="dash"),
+ layer="below"
+ )
+
+ # Personalizza l'hovertemplate per renderlo leggibile e carino
+ for trace in fig.data:
+ trace.hovertemplate = (
+ "%{text}
"
+ "Cluster: %{marker.color}
"
+ "Contribuzione: %{marker.size:.2f}"
+ )
+
+ fig.update_layout(
+ xaxis=dict(
+ title=xlabel,
+ zeroline=True,
+ zerolinewidth=1.5,
+ zerolinecolor="#B0B0B0",
+ showgrid=True,
+ gridcolor="lightgray",
+ showline=False,
+ showticklabels=True
+ ),
+ yaxis=dict(
+ title=ylabel,
+ zeroline=True,
+ zerolinewidth=1.5,
+ zerolinecolor="#B0B0B0",
+ showgrid=True,
+ gridcolor="lightgray",
+ showline=False,
+ showticklabels=True
+ ),
+ plot_bgcolor="rgba(0,0,0,0)",
+ paper_bgcolor="rgba(0,0,0,0)",
+ showlegend=False,
+ height=800,
+ hoverlabel=dict(
+ bgcolor="white",
+ font_size=13,
+ font_family="Segoe UI, Arial",
+ bordercolor="#5567BB"
+ ),
+ )
+ fig = go.FigureWidget(fig)
+ fig._config = fig._config | {'modeBarButtonsToRemove': ['pan', 'select', 'lasso2d', 'toImage'],
+ 'displaylogo': False}
+
+ #####################################################################################
+
+ ################################### DENDROGRAM COERENTE CON WORD MAP ###################################
+ import networkx as nx
+ from pyvis.network import Network
+ from scipy.cluster.hierarchy import linkage, to_tree
+ from pathlib import Path
+ from scipy.cluster.hierarchy import optimal_leaf_ordering
+ from scipy.spatial.distance import pdist
+ import math
+ import tempfile
+ import os
+
+ # 1. Linkage, labels, cluster mapping
+ labels_lower = CS["km_res"]["data"].index.str.lower().tolist()
+ coords = CS["km_res"]["data"][["Dim1", "Dim2"]].values
+ linkage_matrix = CS["linkage"]
+
+ word_to_cluster = dict(zip(WData["word"], WData["cluster"]))
+ group_colors = assign_consistent_colors(WData["cluster"])
+ leaf_offset = len(labels_lower)
+
+ # 2. Ordina le parole secondo dendrogramma
+ ddata = dendrogram(linkage_matrix, labels=labels_lower, no_plot=True)
+ words_sorted = ddata["ivl"]
+ n_terms = len(words_sorted)
+ scale_factor = int(500 * math.log2(n_terms + 1)) # log-scale vertical height
+
+ # 3. Inizializza rete Pyvis
+ tree, nodes = to_tree(linkage_matrix, rd=True)
+ net = Network(height="98vh", width="100%", directed=True, notebook=True, cdn_resources="in_line")
+ net.toggle_physics(False)
+ positions = {}
+ label_boxes = []
+ node_to_cluster = {}
+
+ leaf_x = 0
+ x_spacing = 100
+ label_to_new_index = {label: i for i, label in enumerate(words_sorted)}
+
+ # Per memorizzare cambi cluster
+ cut_lines = {}
+
+ # LEAF NODES
+ for i, label in enumerate(words_sorted):
+ node_id = i
+ x = leaf_x
+ y = 0
+ cluster = word_to_cluster.get(label.lower(), -1)
+ color = group_colors.get(cluster, "#999999")
+ node_to_cluster[node_id] = cluster
+ positions[node_id] = (x, y)
+
+ # Nodo foglia
+ net.add_node(
+ node_id,
+ label=" ",
+ color=color,
+ shape="dot",
+ size=6,
+ title=label,
+ font={"size": 18, "face": "arial"},
+ physics=False,
+ x=x,
+ y=y + 40
+ )
+
+ # Nodo stub
+ stub_y = y - 20
+ stub_id = f"stub_{node_id}"
+ positions[stub_id] = (x, stub_y)
+ net.add_node(
+ stub_id,
+ label=" ",
+ title=" ",
+ color="#00000000",
+ shape="dot",
+ size=1,
+ physics=False,
+ x=x,
+ y=stub_y,
+ font={"color": "#00000000", "size": 1}
+ )
+
+ net.add_edge(
+ stub_id,
+ node_id,
+ label=" ",
+ color=color,
+ width=10,
+ smooth=False,
+ physics=False,
+ arrows=""
+ )
+
+ # Label HTML dinamica
+ box_html = f"""
+
+ {label.upper()}
+
+ """
+ label_boxes.append(box_html)
+ leaf_x += x_spacing
+
+ # MERGE
+ def add_internal_nodes(node):
+ if node.is_leaf():
+ label = labels_lower[node.id]
+ new_id = label_to_new_index[label]
+ stub_id = f"stub_{new_id}"
+ return positions[stub_id], stub_id
+
+ # 1. Ricorsione sui figli
+ left_pos, left_stub_id = add_internal_nodes(node.left)
+ right_pos, right_stub_id = add_internal_nodes(node.right)
+
+ # 2. Coordinate del nodo interno
+ x_center = (left_pos[0] + right_pos[0]) / 2
+ y = min(left_pos[1], right_pos[1])
+ max_dist = linkage_matrix[:, 2].max()
+ stub_y = distance_to_y(node.dist, max_dist, scale_factor)
+
+
+ node_id = node.id + leaf_offset
+ stub_id = f"stub_{node_id}"
+ positions[node_id] = (x_center, y)
+ positions[stub_id] = (x_center, stub_y)
+ total = node.count
+
+ # 3. Colore cluster (ereditato dal figlio sinistro)
+ left_cluster = node_to_cluster.get(
+ node.left.id + leaf_offset if not node.left.is_leaf() else label_to_new_index[labels_lower[node.left.id]],
+ -1
+ )
+ right_cluster = node_to_cluster.get(
+ node.right.id + leaf_offset if not node.right.is_leaf() else label_to_new_index[labels_lower[node.right.id]],
+ -1
+ )
+
+ cluster = left_cluster
+ node_to_cluster[node_id] = cluster
+ color = group_colors.get(cluster, "#999999")
+
+ # 4. Nodo interno
+ net.add_node(
+ node_id,
+ label=" ",
+ shape="dot",
+ size=20,
+ physics=False,
+ x=x_center,
+ y=y,
+ title=f"Distance: {node.dist:.2f} Words: {total}",
+ color={
+ "background": "#FFFFFF", # Riempimento bianco
+ "border": "#3399FF", # Bordo blu tenue
+ "highlight": "#000000" # Colore al passaggio mouse (opzionale)
+ },
+ borderWidth=2,
+ )
+
+
+ # 5. Nodo stub sopra
+ net.add_node(
+ stub_id,
+ label=" ",
+ title=f"Distance: {node.dist:.2f} Words: {total}",
+ color="#00000000",
+ shape="dot",
+ size=4,
+ physics=False,
+ x=x_center,
+ y=stub_y,
+ font={"color": "#00000000", "size": 1}
+ )
+
+ # 6. Edge verticale (stub → nodo)
+ if node != tree:
+ net.add_edge(
+ stub_id,
+ node_id,
+ label=" ",
+ title=f"Distance: {node.dist:.2f} Words: {node.count}",
+ color=color,
+ width=10,
+ smooth=False,
+ physics=False,
+ arrows=""
+ )
+
+ # 7. Collega i due figli
+ for child_stub_id in [left_stub_id, right_stub_id]:
+ child_x, child_y = positions[child_stub_id]
+ inter_id = f"{node_id}_{child_stub_id}_v"
+ inter_y = y
+
+ net.add_node(
+ inter_id,
+ label=" ",
+ title=" ",
+ color="#00000000",
+ shape="dot",
+ size=1,
+ physics=False,
+ x=child_x,
+ y=inter_y
+ )
+
+ # print(f"[HLINE] Nodo {node_id} connesso a {child_stub_id} a y={inter_y:.2f}")
+
+ net.add_edge(
+ node_id,
+ inter_id,
+ color=color,
+ title=f"Distance: {node.dist:.2f} Words: {node.count}",
+ width=10,
+ smooth=False,
+ physics=False,
+ arrows=""
+ )
+ net.add_edge(
+ inter_id,
+ child_stub_id,
+ color=color,
+ title=f"Distance: {node.dist:.2f} Words: {node.count}",
+ width=10,
+ smooth=False,
+ physics=False,
+ arrows=""
+ )
+
+ # 8. Linea di taglio (se cambia cluster)
+ left_leaf_clusters = get_leaf_clusters(node.left, label_to_new_index, labels_lower, node_to_cluster)
+ right_leaf_clusters = get_leaf_clusters(node.right, label_to_new_index, labels_lower, node_to_cluster)
+
+ if left_leaf_clusters.isdisjoint(right_leaf_clusters):
+ cl1 = min(left_leaf_clusters)
+ cl2 = min(right_leaf_clusters)
+ cluster_pair = tuple(sorted((cl1, cl2)))
+ if cluster_pair not in cut_lines:
+ cut_lines[cluster_pair] = y # posizione reale della fusione visibile
+ # print(f"[CUT LINE] Cambio cluster {cluster_pair} a y = {stub_y:.2f}")
+
+
+ return (x_center, stub_y), stub_id
+
+ # Costruisci
+ _, root_stub_id = add_internal_nodes(tree)
+
+ # Aggiungi linee rosse di taglio
+ # Aggiungi solo la linea di taglio più bassa (cioè y più vicino allo 0)
+ if cut_lines:
+ # Trova la coppia con il max y (cioè la linea di taglio più bassa visivamente)
+ (cl1, cl2), y = max(cut_lines.items(), key=lambda x: x[1])
+
+ net.add_node(
+ f"cut_{cl1}_{cl2}_left", x=0, y=y, label="", shape="dot", size=0.1, color="#FF0000", physics=False
+ )
+ net.add_node(
+ f"cut_{cl1}_{cl2}_right", x=(leaf_x - x_spacing), y=y, label="", shape="dot", size=0.1, color="#FF0000", physics=False
+ )
+ net.add_edge(
+ f"cut_{cl1}_{cl2}_left",
+ f"cut_{cl1}_{cl2}_right",
+ label=f"cut @ y={y:.1f}",
+ color="#FF0000",
+ width=20,
+ physics=False,
+ arrows=""
+ )
+
+ # 1. Salva grafo base in HTML
+ html = net.generate_html()
+
+ # 2. Inietta etichette HTML
+ injection = f"""
+
+ {''.join(label_boxes)}
+
+ """
+
+ html = html.replace("