Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
412c9b4
Add get_top_ranked_genes
happymealinthebuilding Apr 11, 2025
77bd5ef
Add rename_clusters function to rename cluster labels in adata.obs
happymealinthebuilding Apr 20, 2025
d21df2e
Move rename_clusters function to modules directory and update impleme…
happymealinthebuilding Apr 20, 2025
9e65fce
Update .gitignore to include data directory
happymealinthebuilding Apr 20, 2025
af76b89
Add dotplot visualization for marker gene expression in single-cell R…
happymealinthebuilding Apr 20, 2025
bcbe0e0
Add tutorial comment to demo_get_top_ranked_genes.py
happymealinthebuilding Apr 20, 2025
ce53b0b
Remove tutorial comment from demo_get_top_ranked_genes.py
happymealinthebuilding Apr 20, 2025
6c83a44
Add Tutorial File
happymealinthebuilding Apr 20, 2025
66685d0
Remove tutorial file from the project
happymealinthebuilding Apr 20, 2025
fb150a9
Add files via upload
Ekin-hub-code Apr 25, 2025
655ed2d
Added marked_genes_demo.py file
eceygtt Apr 26, 2025
e227aa5
Update marked_genes.py
Ekin-hub-code Apr 27, 2025
96cc703
Add rename_cluster_demo.py to demonstrate renaming cluster labels in …
happymealinthebuilding Apr 28, 2025
f919f47
Add get_top_ranked_genes_demo function
happymealinthebuilding Apr 28, 2025
a5d9159
Add dotplot visualization demo for marker gene expression in single-c…
happymealinthebuilding Apr 28, 2025
31efc44
Merge pull request #29 from IEEE-Ege/dotplot_adata_marker_genes_
happymealinthebuilding May 23, 2025
f3eff81
Merge branch 'feature/finding_marker_genes' into adata_rename_categor…
happymealinthebuilding May 23, 2025
15cdfc4
Merge pull request #28 from IEEE-Ege/adata_rename_categories_chosen_a…
happymealinthebuilding May 23, 2025
59cc3d5
Merge pull request #27 from IEEE-Ege/get_top_ranked_genes_adata_top_n
happymealinthebuilding May 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ cython_debug/

# PyPI configuration file
.pypirc


23 changes: 23 additions & 0 deletions src/modules/dotplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import scanpy as sc

def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'):
    """
    Draws the marker-gene overview plots for a clustered dataset.

    Parameters:
        adata (AnnData): Annotated data matrix holding the expression data.
        marker_genes (list or dict): Genes to plot; passed through unchanged
            to the scanpy plotting functions.
        cluster_key (str): Column of `adata.obs` used to group the cells
            (default: 'leiden').

    Returns:
        None: The plots are produced by scanpy as a side effect.
    """
    # Each entry pairs a scanpy plotting function with the keyword arguments
    # that are specific to it. The dotplot is drawn first, then the stacked
    # violin plot with rotated labels.
    plots = (
        (sc.pl.dotplot, {}),
        (sc.pl.stacked_violin, {'rotation': 90}),
    )
    for draw, extra in plots:
        draw(adata, marker_genes, groupby=cluster_key, **extra)



72 changes: 72 additions & 0 deletions src/modules/dotplot_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import scanpy as sc
import warnings
import pandas as pd

def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'):
    """
    Visualizes marker gene expression across annotated cell clusters.

    The requested genes are validated against `adata.var_names` first: genes
    that are not present are reported with a warning and dropped, and groups
    that end up empty are removed. The remaining genes are then shown as a
    dotplot followed by a stacked violin plot.

    Parameters:
        adata (AnnData): Annotated data matrix containing single-cell RNA-seq data.
        marker_genes (list, tuple or dict): Marker genes to visualize. Either a
            flat sequence of gene names, or a dict mapping a group label to a
            gene name or a sequence of gene names (for grouped markers).
        cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden').

    Returns:
        None: Displays dotplot and stacked violin plot of marker gene expression.

    Raises:
        ValueError: If `cluster_key` is not a column of `adata.obs`, or if none
            of the requested genes exist in `adata.var_names`.
        TypeError: If `marker_genes` is not a list, tuple or dict.
    """
    # Check if cluster_key exists in adata.obs columns
    if cluster_key not in adata.obs.columns:
        raise ValueError(f"'{cluster_key}' column not found in adata.obs. Available columns: {list(adata.obs.columns)}")

    # Normalize the input: `groups` is set only for grouped (dict) markers.
    if isinstance(marker_genes, dict):
        # A bare string is one gene, not a sequence of one-letter genes.
        groups = {
            label: [genes] if isinstance(genes, str) else list(genes)
            for label, genes in marker_genes.items()
        }
        flat_genes = [gene for genes in groups.values() for gene in genes]
    elif isinstance(marker_genes, (list, tuple)):
        groups = None
        flat_genes = list(marker_genes)
    else:
        # Raise an error if marker_genes is not a list or dictionary
        raise TypeError("marker_genes must be a list or a dictionary of gene lists.")

    # Look every distinct gene up once; the set makes later filtering O(1).
    distinct_genes = list(dict.fromkeys(flat_genes))
    present = {gene for gene in distinct_genes if gene in adata.var_names}
    missing_genes = [gene for gene in distinct_genes if gene not in present]

    # Warn the user about any genes that were not found.
    # stacklevel=2 attributes the warning to the caller's line.
    if missing_genes:
        warnings.warn(
            f"The following genes are not found in adata.var_names and will be ignored: {missing_genes}",
            stacklevel=2,
        )

    # If no valid genes are left after checking, raise an error
    if not present:
        raise ValueError("None of the provided marker genes were found in adata.var_names.")

    if groups is not None:
        # Filter genes within each group, then drop groups that became empty.
        filtered = {
            label: [gene for gene in genes if gene in present]
            for label, genes in groups.items()
        }
        marker_genes = {label: genes for label, genes in filtered.items() if genes}
    else:
        # Keep the caller's order (and any repeats) for flat input.
        marker_genes = [gene for gene in flat_genes if gene in present]

    # Generate the dotplot
    print("Generating dotplot...")
    sc.pl.dotplot(adata, marker_genes, groupby=cluster_key)
    print("Dotplot generated.")

    # Generate the stacked violin plot
    print("Generating stacked violin plot...")
    sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90)
    print("Stacked violin plot generated.")

    # Note: These plotting functions typically display the plots automatically in interactive environments
    # If running as a script, you might need to add:
    # import matplotlib.pyplot as plt
    # plt.show()
38 changes: 38 additions & 0 deletions src/modules/get_top_ranked_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd

def extract_top_genes(adata, n_top=5):
    """
    Extracts top-ranked genes and p-values from adata.uns['rank_genes_groups'].

    Parameters:
    - adata: AnnData object after rank_genes_groups has been run.
    - n_top: Number of top genes to return (default is 5). Must not be negative.

    Returns:
    - A tuple of two DataFrames:
        1. Top gene names per group (one column per group).
        2. A combined DataFrame with a `<group>_n` (gene name) column for every
           group followed by a `<group>_p` (p-value) column for every group.
           The p-value columns are filled with NaN when the ranking results
           contain no 'pvals' entry.

    Raises:
    - ValueError: If no ranking results are stored in `adata.uns`, or if
      `n_top` is negative.
    """
    if 'rank_genes_groups' not in adata.uns:
        raise ValueError("No 'rank_genes_groups' results found in adata.uns. Did you run `sc.tl.rank_genes_groups`?")

    # A negative value would silently mean "all but the last |n_top| rows".
    if n_top < 0:
        raise ValueError("`n_top` must be a non-negative integer.")

    result = adata.uns['rank_genes_groups']
    names = result['names']
    groups = names.dtype.names

    top_gene_names = pd.DataFrame(names).head(n_top)

    top_names = {group: names[group][:n_top] for group in groups}
    combined_df = pd.DataFrame({
        f"{group}_n": genes for group, genes in top_names.items()
    })

    # Not every ranking result carries p-values; keep the column layout stable
    # by substituting NaN instead of failing with a KeyError.
    if 'pvals' in result:
        pvals = result['pvals']
        top_pvals = {group: pvals[group][:n_top] for group in groups}
    else:
        top_pvals = {
            group: [float("nan")] * len(genes) for group, genes in top_names.items()
        }
    combined_df_pvals = pd.DataFrame({
        f"{group}_p": values for group, values in top_pvals.items()
    })

    combined = pd.concat([combined_df, combined_df_pvals], axis=1)

    return top_gene_names, combined


43 changes: 43 additions & 0 deletions src/modules/marked_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import scanpy as sc
import pandas as pd

# Demo pipeline: everything below runs at import time and produces the
# "ranked_genes_results.h5ad" file in the current working directory.

# Load the `pbmc3k` example dataset through scanpy
adata = sc.datasets.pbmc3k()

# Log-transform the data in place before the differential expression step.
# NOTE(review): no normalization or gene filtering happens before this call;
# confirm that is intended for this demo.
sc.pp.log1p(adata)

# Reduce dimensionality with PCA, then build the neighbor graph from the
# first 40 principal components
sc.pp.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_pcs=40)

# Cluster the cells with the Leiden algorithm; the result is used below
# under the 'leiden' key
sc.tl.leiden(adata, resolution=1.0, flavor="igraph", directed=False, n_iterations=2)

# Rank genes per 'leiden' cluster with a t-test
sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')

# Persist the AnnData object, including the ranking results, to disk
adata.write("ranked_genes_results.h5ad")

# Set scanpy's verbosity level.
# NOTE(review): this runs after the pipeline above, so it only affects
# scanpy calls made from here on.
sc.settings.verbosity = 2

# Define the function to get marker genes
def get_marker_genes(adata, results_file=None, n_top=5):
    """
    Returns the top-ranked marker genes per group as a DataFrame.

    Parameters:
        adata (AnnData): Object whose `uns['rank_genes_groups']` is used when
            `results_file` is None.
        results_file (str or None): Path of a saved AnnData file. When given,
            the file is read and used instead of `adata`, so `adata` is
            ignored in that case.
        n_top (int): Number of top rows (genes per group) to keep (default: 5).

    Returns:
        pd.DataFrame or None: One column per group holding its top gene
        names, or None (after printing an error message) when no
        'rank_genes_groups' results are available.
    """
    # Only touch the disk when a file was actually requested; otherwise work
    # on the object the caller passed in.
    if results_file is not None:
        adata = sc.read(results_file)

    # Check if 'rank_genes_groups' exists in adata.uns
    if 'rank_genes_groups' not in adata.uns:
        print("Error: 'rank_genes_groups' not found in adata.uns.")
        return None

    # Extract the top marker genes as a DataFrame
    return pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(n_top)

# Run the function on the file written above and print the resulting table.
# The function returns None when no ranking results are found, in which case
# nothing is printed here.
marker_genes = get_marker_genes(adata, results_file="ranked_genes_results.h5ad")
if marker_genes is not None:
    print(marker_genes)
Empty file.
25 changes: 25 additions & 0 deletions src/modules/rename_categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
def rename_clusters(adata, cluster_algo: str, new_cluster_names: list):
    """
    Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names.

    Parameters:
        adata (AnnData): The annotated data matrix.
        cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain').
        new_cluster_names (list): A list of new cluster names, one per existing
            category, in the order of the existing categories.

    Returns:
        None: Updates adata in place.

    Raises:
        ValueError: If `cluster_algo` is not in `adata.obs`, or if the number
            of new names differs from the number of existing categories.
        TypeError: If the `cluster_algo` column is not categorical.
    """
    if cluster_algo not in adata.obs:
        raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been run.")

    labels = adata.obs[cluster_algo]

    # The `.cat` accessor only exists on categorical columns; check the dtype
    # up front so the caller gets a clear message instead of an AttributeError.
    if getattr(labels.dtype, "name", None) != "category":
        raise TypeError(f"'{cluster_algo}' column must be categorical.")

    n_clusters = len(labels.cat.categories)

    if len(new_cluster_names) != n_clusters:
        raise ValueError(f"Number of new names ({len(new_cluster_names)}) does not match number of clusters ({n_clusters}).")

    adata.rename_categories(cluster_algo, new_cluster_names)
    print(f"Clusters renamed using '{cluster_algo}'.")


69 changes: 69 additions & 0 deletions src/modules/rename_cluster_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np
import pandas as pd
from anndata import AnnData
import scanpy as sc

# 1. Create dummy data
# NOTE: the random generator is not seeded, so the matrix and the cluster
# assignment differ from run to run.
X = np.random.rand(100, 10)  # expression matrix: 100 rows (cells) x 10 columns (genes)
var_names = [f"Gene{i}" for i in range(10)]
obs = pd.DataFrame({
    # 'leiden' column: 100 labels drawn from '0', '1', '2', stored as a
    # categorical. The categories are inferred from the sampled values.
    'leiden': pd.Categorical(np.random.choice(['0', '1', '2'], size=100))
})
# Create the AnnData object from the matrix and the per-cell annotations
adata = AnnData(X=X, obs=obs)
adata.var_names = var_names  # attach the gene names

print("Dummy AnnData object created with 'leiden' clusters.")
print(f" Original cluster categories:\n{adata.obs['leiden'].cat.categories}")


# 2. Define the new cluster names
# One name per existing category of 'leiden'; rename_clusters below rejects
# a list whose length differs from the number of categories.
new_cluster_names = ['CD4 T cells', 'Monocytes', 'B cells']
print(f"\nDefined new cluster names: {new_cluster_names}")


# 3. Define the rename_clusters function
def rename_clusters(adata, cluster_algo: str, new_cluster_names: list):
    """
    Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names.

    Parameters:
        adata (AnnData): The annotated data matrix.
        cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain').
        new_cluster_names (list): A list of new cluster names, one per existing
            category, in the order of the existing categories.

    Returns:
        None: Updates adata in place.

    Raises:
        ValueError: If `cluster_algo` is not in `adata.obs`, or if the number
            of new names differs from the number of existing categories.
        TypeError: If the `cluster_algo` column is not categorical.
    """
    print(f"Attempting to rename clusters for column '{cluster_algo}'...")

    # Check if the cluster key exists in adata.obs
    if cluster_algo not in adata.obs:
        raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been performed.")

    labels = adata.obs[cluster_algo]

    # rename_categories works on categorical columns only. The dtype is
    # checked with isinstance because `pd.api.types.is_categorical_dtype`
    # is deprecated in recent pandas releases.
    if not isinstance(labels.dtype, pd.CategoricalDtype):
        raise TypeError(f"'{cluster_algo}' column must be categorical.")

    # Get the number of existing clusters (categories)
    n_clusters = len(labels.cat.categories)

    # Check if the number of new names matches the number of clusters
    if len(new_cluster_names) != n_clusters:
        raise ValueError(f"Expected {n_clusters} new names for '{cluster_algo}', but got {len(new_cluster_names)}.")

    # Perform the renaming
    adata.rename_categories(cluster_algo, new_cluster_names)
    print(f" Cluster names updated successfully in '{cluster_algo}'.")


# 4. Apply the function to the dummy object; the renaming happens in place
print("\nApplying rename_clusters function...")
rename_clusters(adata, cluster_algo='leiden', new_cluster_names=new_cluster_names)


# 5. See the results: print the categories of 'leiden' after the renaming
print("\n Updated cluster names:")
print(adata.obs['leiden'].cat.categories)
50 changes: 50 additions & 0 deletions src/modules/topranked_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd

def get_top_ranked_genes(adata, top_n=5):
    """
    Retrieves the top `top_n` marker genes for each cluster from
    `adata.uns['rank_genes_groups']`.

    Parameters:
    - adata (AnnData): Annotated data matrix after differential expression analysis (`sc.tl.rank_genes_groups`).
    - top_n (int): Number of top-ranked genes to retrieve for each cluster.

    Returns:
    - pd.DataFrame: Long-format DataFrame with 'cluster', 'rank', 'gene_name', 'pval', and 'score'.
      'rank' starts at 1 within each cluster. 'pval' / 'score' are None for
      every row when the ranking results contain no 'pvals' / 'scores' entry.

    Raises:
    - ValueError: If no ranking results are stored in `adata.uns`, or if
      `top_n` is not a positive integer.
    """
    # Check if 'rank_genes_groups' exists in adata.uns
    if 'rank_genes_groups' not in adata.uns:
        raise ValueError("No 'rank_genes_groups' found in adata.uns. Run `sc.tl.rank_genes_groups()` first.")

    # Validate the top_n parameter
    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("`top_n` must be a positive integer.")

    # Get the results dictionary
    result = adata.uns['rank_genes_groups']
    names = result['names']
    # Extract cluster names from the result structure
    groups = names.dtype.names

    # The per-cluster tables are structured arrays indexed by field name, not
    # dicts, so they have no `.get`; look the optional ones up once here.
    pval_table = result['pvals'] if 'pvals' in result else None
    score_table = result['scores'] if 'scores' in result else None

    columns = ['cluster', 'rank', 'gene_name', 'pval', 'score']
    data = []  # One dict per (cluster, gene) row
    for cluster in groups:
        gene_names = names[cluster][:top_n]
        # Size the placeholders by the genes actually available, which may be
        # fewer than top_n.
        placeholder = [None] * len(gene_names)
        pvals = pval_table[cluster][:top_n] if pval_table is not None else placeholder
        scores = score_table[cluster][:top_n] if score_table is not None else placeholder

        data.extend(
            {
                'cluster': cluster,
                'rank': rank,
                'gene_name': gene,
                'pval': pval,
                'score': score,
            }
            for rank, (gene, pval, score) in enumerate(zip(gene_names, pvals, scores), 1)
        )

    # Passing the columns keeps the schema stable even when there are no rows
    return pd.DataFrame(data, columns=columns)