Merged
2 changes: 2 additions & 0 deletions .codespell-ignore.txt
@@ -40,6 +40,8 @@ Strack
Tennant
Udo
Yau
Mathes
Smal

# Technical or code related terms
aCount
6,667 changes: 5,817 additions & 850 deletions content/contributors/tenzing.md

Large diffs are not rendered by default.

45 changes: 45 additions & 0 deletions scripts/forrt_contribs/README.md
@@ -0,0 +1,45 @@
# FORRT Contributors Data Generation

This directory contains the script and template for generating the Contributors page.

## Files

- `tenzing.py` - Python script that fetches contributor data from Google Sheets and generates the `tenzing.md` file
- `tenzing_template.md` - Template file with frontmatter, page structure, and CSS styles
- `tenzing.md` - Generated output file (copied to `content/contributors/tenzing.md` after generation)

The JavaScript file implementing filtering features is located at `static/js/contributor-filter.js`.
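
As a quick reference, `tenzing.py` embeds the filter options in the page as a `window.filterData` object, which `contributor-filter.js` reads to populate the dropdown menus. A minimal sketch of its shape (the project and role labels below are hypothetical examples; the real lists are built from the sheets at generation time):

```
# Shape of the data exposed as window.filterData; labels are hypothetical examples.
filter_data = {
    "projects": [
        {"value": "forrt-glossary", "label": "FORRT Glossary"},
        {"value": "replications-and-reversals", "label": "Replications & Reversals"},
    ],
    "roles": [
        {"value": "conceptualization", "label": "Conceptualization"},
        {"value": "project-manager", "label": "Project Manager"},
    ],
}
```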

## How the Data is Generated

The `tenzing.py` script:

1. Fetches data from:
- The Tenzing index ("Tenzing Automation Source" sheet)
- The "FORRT Lead Tenzing Sheet"

**Error handling:** If any project sheets fail to load, the script logs the failures to `tenzing_failures.json`, which triggers a GitHub workflow to create an issue for investigation.

2. Processes the data to:
- Consolidate each person's contributions across FORRT projects
- Generate HTML for display on the Contributors page
- Add `data-*` attributes to enable filtering by project/role (see the sketch after this list)
- Add `id` attributes (when ORCID is available) to enable anchor links (e.g., `https://forrt.org/contributors#0000-0000-0000-0000`)
- Generate a JSON object with all unique projects and roles to populate filter dropdown menus

3. Creates the final output by:
- Reading `tenzing_template.md`
- Appending the generated HTML
- Writing to `tenzing.md`
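
For illustration, the sketch below shows how a single contribution row becomes a filterable list item. It is a simplified extract of the logic in `tenzing.py`; the project name, URL, and roles are hypothetical:

```
import html
import re

def normalize_for_attribute(text):
    """Lowercase, replace '&' with 'and', and collapse non-alphanumeric runs to hyphens."""
    name = text.lower().strip().replace('&', 'and')
    name = re.sub(r'[^a-z0-9]+', '-', name)
    return name.strip('-')

# Hypothetical row values
project_name = "Replications & Reversals"
project_url = "https://forrt.org/example-project/"
roles = ["Conceptualization", "Project Manager"]

projects_attr = html.escape(normalize_for_attribute(project_name), quote=True)
roles_attr = html.escape(','.join(normalize_for_attribute(r) for r in roles), quote=True)

print(
    f'<li class="contribution" data-projects="{projects_attr}" '
    f'data-roles="{roles_attr}"><a href="{project_url}">{project_name}</a> '
    f'with <em>Conceptualization</em> and <em>Project Manager</em></li>'
)
# data-projects="replications-and-reversals", data-roles="conceptualization,project-manager"
```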


**Important:** `tenzing.md` is auto-generated and should never be edited manually.

## Local Development

When working with `tenzing.py` locally, copy the generated file to the content directory before rendering the site:

```
cp scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
hugo server
```
6,605 changes: 5,818 additions & 787 deletions scripts/forrt_contribs/tenzing.md

Large diffs are not rendered by default.

208 changes: 180 additions & 28 deletions scripts/forrt_contribs/tenzing.py
@@ -1,5 +1,7 @@
import pandas as pd
import os
import re
import html
import json

def print_failures(failed_sheets):
@@ -41,7 +43,6 @@ def print_failures(failed_sheets):
# Make sure each URL is transformed into a CSV export URL as shown above
data_frame = pd.read_csv(url)

# --- LOGGING ADDED HERE ---
# Log the number of contributors read from the current project
print(f"✓ Read {len(data_frame)} contributors from '{project_name}'.")

@@ -79,8 +80,16 @@ def print_failures(failed_sheets):

def concatenate_true_columns(row, columns):
true_columns = [col for col in columns if pd.notna(row[col]) and row[col]]
if 'Project Managers' in true_columns:
other_columns = [f'*{col}*' for col in true_columns if col != 'Project Managers']

# Check for both "Project Managers" and "Project manager" (case variations)
pm_column = None
for col in true_columns:
if col.lower() == 'project managers' or col.lower() == 'project manager':
pm_column = col
break

if pm_column:
other_columns = [f'*{col}*' for col in true_columns if col != pm_column]
if other_columns:
return 'as Project Manager and with ' + ', '.join(other_columns[:-1]) + (' and ' if len(other_columns) > 1 else '') + other_columns[-1]
else:
@@ -90,6 +99,7 @@ def concatenate_true_columns(row, columns):

# List of column names to check for TRUE values
fields_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vT_IaXiYtB3iAmtDZ_XiQKrToRkxOlkXNAeNU2SIT_J9PxvsQyptga6Gg9c8mSvDZpwY6d8skswIQYh/pub?output=csv&gid=277271370"

try:
column_mappings = pd.read_csv(fields_url)
print(f"✓ Successfully loaded column mappings with {len(column_mappings)} fields")
@@ -107,6 +117,7 @@ def concatenate_true_columns(row, columns):
# Filtering rows based on the updated columns_to_check list
# Note: columns_to_check needs to be updated to the renamed columns for the filter to work correctly
columns_to_check = [rename_dict[col] for col in columns_to_check if col in rename_dict]

# Remove columns not present
columns_present = [col for col in columns_to_check if col in merged_data.columns]
columns_dropped = set(columns_to_check) - set(columns_present)
@@ -141,7 +152,7 @@ def concatenate_true_columns(row, columns):

# Sort based on surname
merged_data['sort_order'] = merged_data['Surname']
merged_data = merged_data.sort_values(by='sort_order')
merged_data = merged_data.sort_values(by='sort_order')
merged_data = merged_data.drop(columns='sort_order')

# Strip spaces from 'ORCID iD' in merged data
@@ -152,7 +163,7 @@ def format_name(row):
# Extract the first name, middle name initial, and surname
first_name = row['First name'].strip() if pd.notna(row['First name']) else ""
middle_name = row['Middle name']
surname = row['Surname'].strip() if pd.notna(row['Surname']) else ""
surname = row['Surname'].strip().rstrip('*') if pd.notna(row['Surname']) else ""

# Check if the middle name is not NaN and not an empty string
if pd.notna(middle_name) and middle_name != '':
@@ -169,35 +180,129 @@ def format_name(row):
# Propagate ORCID iD within each contributor's grouping
merged_data['ORCID iD'] = merged_data.groupby('full_name')['ORCID iD'].transform(lambda x: x.ffill().bfill())

# Helper function to normalize project/role names for data attributes
def normalize_for_attribute(text):
"""Normalize text for use in HTML data-* attributes."""
if pd.isna(text) or text == '':
return ''

# Lowercase + trim
name = text.lower().strip()

# Replace & with 'and'
name = name.replace('&', 'and')

# Replace ANY non-alphanumeric sequence with a hyphen
name = re.sub(r'[^a-z0-9]+', '-', name)

# Collapse multiple hyphens
name = re.sub(r'-+', '-', name)

# Remove leading/trailing hyphens
name = name.strip('-')

return name


# Group by 'ORCID iD' and concatenate the contributions
def concatenate_contributions(group):

# Find the minimum original order for the group
def concatenate_contributions(group):
# Minimum original order for sorting later
min_order = group['original_order'].min()

# Format the full name once per group
# Format name once
full_name = format_name(group.iloc[0])
group = group.sort_values(by='special_role', ascending=False)

# Create the contributions string for each project
contributions = [
f"{row['Project Name']} {('as' if row['special_role'] else '')} {row['Contributions']}" if pd.isna(row['Project URL']) or row['Project URL'] == ''
else f"[{row['Project Name']}]({row['Project URL']}) {('as' if row['special_role'] else '')} {row['Contributions']}"
for _, row in group.iterrows()
]

# Add numbering only if there are more than 1 contributions
if len(contributions) > 1:
contributions = [f"{i+1}. {contribution}" for i, contribution in enumerate(contributions)]

# Turn contributions into multiline list or single line
contributions_str = contributions[0] if len(contributions) == 1 else '\n ' + '\n '.join(contributions) + '\n' + '{{<rawhtml>}}<br/>&nbsp;<br/> {{</rawhtml>}}'

orcid_id = group.iloc[0]['ORCID iD']

# Build name HTML
if orcid_id:
return min_order, f"- **[{full_name}]({'https://orcid.org/' + orcid_id.strip()})** contributed to {contributions_str}"
name_html = f'<strong><a href="https://orcid.org/{orcid_id.strip()}">{full_name}</a></strong>'
else:
return min_order, f"- **{full_name}** contributed to {contributions_str}"
name_html = f'<strong>{full_name}</strong>'

# Build individual contribution items
contribution_items = []

for _, row in group.iterrows():
project_name = row['Project Name']
if pd.isna(project_name) or project_name == '':
continue

# Normalize for data attributes
normalized_project = normalize_for_attribute(project_name)

# Extract roles for this specific contribution
contribution_roles = []
contributions_text = row['Contributions']

if pd.notna(contributions_text):
# Extract "Project Manager" if present (regardless of special_role)
pm_match = re.search(r'as\s+Project\s+Manager(?:\s+and\s+with)?', contributions_text, re.IGNORECASE)
if pm_match:
if 'project-manager' not in contribution_roles:
contribution_roles.append('project-manager')

# Extract special roles (for special_role=True cases)
if row['special_role']:
special_role_match = re.search(r'(?:as\s+)?(.+?)(?:\s+and\s+with|\s+and|$)', contributions_text)
if special_role_match:
special_role_text = special_role_match.group(1).strip()
normalized_special = normalize_for_attribute(special_role_text)
if normalized_special not in contribution_roles:
contribution_roles.append(normalized_special)

# Extract roles marked with *
role_matches = re.findall(r'\*([^*]+)\*', contributions_text)
for role_match in role_matches:
normalized_role = normalize_for_attribute(role_match)
if normalized_role not in contribution_roles:
contribution_roles.append(normalized_role)

# Build project HTML
if pd.notna(row['Project URL']) and row['Project URL'] != '':
project_html = f'<a href="{row["Project URL"]}">{project_name}</a>'
else:
project_html = project_name

# Convert *role* → <em>role</em>
contrib_html = re.sub(r'\*([^*]+)\*', r'<em>\1</em>', contributions_text) if pd.notna(contributions_text) else ''

# Handle special role phrasing
if row['special_role']:
# Use the actual special role text from contributions_text
full_contrib = f'{project_html} as {contributions_text}'
else:
full_contrib = f'{project_html} {contrib_html}'

# Create data attributes for this contribution
projects_attr = html.escape(normalized_project, quote=True)
roles_attr = html.escape(','.join(contribution_roles), quote=True)

# Build the contribution <li>
contribution_items.append(
f' <li class="contribution" data-projects="{projects_attr}" '
f'data-roles="{roles_attr}">{full_contrib}</li>'
)

# Build the complete contributor group HTML
contributions_list = '\n'.join(contribution_items)

# Add id attribute if ORCID exists
id_attr = f' id="{orcid_id.strip()}"' if orcid_id else ''

final_html = (
f'<li class="contributor-group"{id_attr}>\n'
f' {name_html} contributed to\n'
f' <ul class="contributions-list">\n'
f'{contributions_list}\n'
f' </ul>\n'
f'</li>\n'
)

return min_order, final_html


def extract_orcid_id(value):
if not isinstance(value, str) or len(value) < 5:
@@ -208,7 +313,6 @@ def extract_orcid_id(value):

return value

# Assuming 'data' is your DataFrame
merged_data['ORCID iD'] = merged_data['ORCID iD'].apply(extract_orcid_id)

# Creating a new column for the concatenated name
@@ -217,6 +321,9 @@ def extract_orcid_id(value):
# Apply the function to each group and create a summary DataFrame
merged_data['original_order'] = range(len(merged_data))

# Move Flavio to the end of the list
merged_data.loc[merged_data["ORCID iD"] == "0000-0001-9000-8513", 'original_order'] = 99999

# Perform the groupby operation without sorting
summary = (merged_data.groupby(merged_data['ORCID iD'].fillna(merged_data['Name']), sort=False)
.apply(concatenate_contributions)
@@ -235,7 +342,52 @@ def extract_orcid_id(value):
summary = summary.reset_index(drop=True)
summary_string = '\n\n'.join(summary['Contributions'])

# --- LOGGING ADDED HERE ---

# Get project and role names for dropdown filters

project_names = sorted(merged_data["Project Name"].dropna().unique())


role_names = list(set(columns_to_check + df_roles["Contributions"].dropna().unique().tolist()))

projects_list = sorted(
[
{"value": normalize_for_attribute(p), "label": p}
for p in project_names
if p not in (None, "")
],
key=lambda x: x["label"]
)

# Deduplicate roles by 'value', keeping the first label encountered (for solving Project Manager / Project manager issue)
roles_dict = {}
for r in role_names:
if r not in (None, ""):
normalized = normalize_for_attribute(r)
if normalized not in roles_dict:
roles_dict[normalized] = r

roles_list = sorted(
[{"value": k, "label": v} for k, v in roles_dict.items()],
key=lambda x: x["label"]
)

# Save in json format
filter_data = {
"projects": projects_list,
"roles": roles_list
}

# Add closing tags and JavaScript include
footer_content = f"""
</ul>
<script>
// Value-labels for filtering menus
window.filterData = {json.dumps(filter_data, indent=2)};
</script>
<script src="/js/contributor-filter.js"></script>
"""

# Log the final deduplicated number of contributors
print("\n--- Processing Complete ---")
print(f"Total number of unique contributors after deduplication: {len(summary)}")
@@ -255,8 +407,8 @@ def extract_orcid_id(value):
with open(template_path, 'r') as file:
template_content = file.read()

# Combine the template content with the new summary string
combined_content = template_content + summary_string
# Combine the template content with the new summary string and footer
combined_content = template_content + summary_string + footer_content

# Save the combined content to 'tenzing.md'
with open(output_path, 'w') as file: