From 412c9b497a758e13eea15f7495cf7f6eb255b8f1 Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Fri, 11 Apr 2025 20:07:37 +0300 Subject: [PATCH 01/15] Add get_top_ranked_genes --- src/modules/get_top_ranked_genes.py | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/modules/get_top_ranked_genes.py diff --git a/src/modules/get_top_ranked_genes.py b/src/modules/get_top_ranked_genes.py new file mode 100644 index 0000000..6f7cf8e --- /dev/null +++ b/src/modules/get_top_ranked_genes.py @@ -0,0 +1,38 @@ +import pandas as pd + +def extract_top_genes(adata, n_top=5): + """ + Extracts top-ranked genes and p-values from adata.uns['rank_genes_groups']. + + Parameters: + - adata: AnnData object after rank_genes_groups has been run. + - n_top: Number of top genes to return (default is 5). + + Returns: + - A tuple of two DataFrames: + 1. Top gene names per group. + 2. A combined DataFrame of top genes and p-values. + """ + + if 'rank_genes_groups' not in adata.uns: + raise ValueError("No 'rank_genes_groups' results found in adata.uns. Did you run `sc.tl.rank_genes_groups`?") + + result = adata.uns['rank_genes_groups'] + groups = result['names'].dtype.names + + top_gene_names = pd.DataFrame(result['names']).head(n_top) + + combined_df = pd.DataFrame({ + f"{group}_n": result['names'][group][:n_top] + for group in groups + }) + combined_df_pvals = pd.DataFrame({ + f"{group}_p": result['pvals'][group][:n_top] + for group in groups + }) + + combined = pd.concat([combined_df, combined_df_pvals], axis=1) + + return top_gene_names, combined + + From 77bd5ef4a58b13a26fe575a2ad8ac9bf88756bcf Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:06:04 +0300 Subject: [PATCH 02/15] Add rename_clusters function to rename cluster labels in adata.obs --- .gitignore | 3 +++ src/rename_categories.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 src/rename_categories.py diff --git a/.gitignore b/.gitignore index 0a19790..2074307 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,6 @@ cython_debug/ # PyPI configuration file .pypirc + +# add data files +data/ diff --git a/src/rename_categories.py b/src/rename_categories.py new file mode 100644 index 0000000..f3f1385 --- /dev/null +++ b/src/rename_categories.py @@ -0,0 +1,25 @@ +def rename_clusters(adata, cluster_algo: str, new_cluster_names: list): + """ + Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names. + + Parameters: + adata (AnnData): The annotated data matrix. + cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain'). + new_cluster_names (list): A list of new cluster names. + + Returns: + None: Updates adata in place. + """ + + if cluster_algo not in adata.obs: + raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been run.") + + n_clusters = len(adata.obs[cluster_algo].cat.categories) + + if len(new_cluster_names) != n_clusters: + raise ValueError(f"Number of new names ({len(new_cluster_names)}) does not match number of clusters ({n_clusters}).") + + adata.rename_categories(cluster_algo, new_cluster_names) + print(f"Clusters renamed using '{cluster_algo}'.") + + From d21df2e98763a9ced559ead29ba4457d0e82143f Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:07:12 +0300 Subject: [PATCH 03/15] Move rename_clusters function to modules directory and update implementation --- src/{ => modules}/rename_categories.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{ => modules}/rename_categories.py (100%) diff --git a/src/rename_categories.py b/src/modules/rename_categories.py similarity index 100% rename from src/rename_categories.py rename to src/modules/rename_categories.py From 9e65fcec56654d3890ec0427e7f543aa92129d4d Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:09:03 +0300 Subject: [PATCH 04/15] Update .gitignore to include data directory --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 0a19790..4125599 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,6 @@ cython_debug/ # PyPI configuration file .pypirc + +# add data file +data/ From af76b89d7aeba615ff161cef66b0fdd24a8c13bc Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:13:39 +0300 Subject: [PATCH 05/15] Add dotplot visualization for marker gene expression in single-cell RNA-seq data --- src/modules/dotplot.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/modules/dotplot.py diff --git a/src/modules/dotplot.py b/src/modules/dotplot.py new file mode 100644 index 0000000..eeab89a --- /dev/null +++ b/src/modules/dotplot.py @@ -0,0 +1,23 @@ +import scanpy as sc + +def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'): + """ + Visualizes marker gene expression across annotated cell clusters. + + Parameters: + adata (AnnData): Annotated data matrix containing single-cell RNA-seq data. + marker_genes (list or dict): Marker genes to visualize. + cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden'). + + Returns: + None: Displays dotplot and stacked violin plot of marker gene expression. + """ + + # Dotplot to show average expression and percent of expressing cells + sc.pl.dotplot(adata, marker_genes, groupby=cluster_key) + + # Stacked violin plot to show gene expression distribution per cluster + sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90) + + + From bcbe0e0ad7f11f675529cf688a2433da54165f3e Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:25:39 +0300 Subject: [PATCH 06/15] Add tutorial comment to demo_get_top_ranked_genes.py --- src/modules/demo_get_top_ranked_genes.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/modules/demo_get_top_ranked_genes.py diff --git a/src/modules/demo_get_top_ranked_genes.py b/src/modules/demo_get_top_ranked_genes.py new file mode 100644 index 0000000..85d37e6 --- /dev/null +++ b/src/modules/demo_get_top_ranked_genes.py @@ -0,0 +1 @@ +#tutorial \ No newline at end of file From ce53b0b82f28007b36471bdc807cf7a68eeaeddb Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:26:44 +0300 Subject: [PATCH 07/15] Remove tutorial comment from demo_get_top_ranked_genes.py --- src/modules/demo_get_top_ranked_genes.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/modules/demo_get_top_ranked_genes.py diff --git a/src/modules/demo_get_top_ranked_genes.py b/src/modules/demo_get_top_ranked_genes.py deleted file mode 100644 index 85d37e6..0000000 --- a/src/modules/demo_get_top_ranked_genes.py +++ /dev/null @@ -1 +0,0 @@ -#tutorial \ No newline at end of file From 6c83a4420cdbc45e13fe705af83ffd152690ffa4 Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:29:31 +0300 Subject: [PATCH 08/15] Add Tutorial File --- src/modules/tutorial.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 src/modules/tutorial.py diff --git a/src/modules/tutorial.py b/src/modules/tutorial.py new file mode 100644 index 0000000..7e51135 --- /dev/null +++ b/src/modules/tutorial.py @@ -0,0 +1,2 @@ +#tutorial + From 66685d06a7524f85c2cad20408bb5ccae9711712 Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Sun, 20 Apr 2025 23:30:24 +0300 Subject: [PATCH 09/15] Remove tutorial file from the project --- src/modules/tutorial.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 src/modules/tutorial.py diff --git a/src/modules/tutorial.py b/src/modules/tutorial.py deleted file mode 100644 index 7e51135..0000000 --- a/src/modules/tutorial.py +++ /dev/null @@ -1,2 +0,0 @@ -#tutorial - From fb150a9de12fc9cbdf18b280f667ae1e26552e69 Mon Sep 17 00:00:00 2001 From: Ekin-hub-code Date: Fri, 25 Apr 2025 18:19:20 +0300 Subject: [PATCH 10/15] Add files via upload --- src/modules/marked_genes.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 src/modules/marked_genes.py diff --git a/src/modules/marked_genes.py b/src/modules/marked_genes.py new file mode 100644 index 0000000..3fb30d4 --- /dev/null +++ b/src/modules/marked_genes.py @@ -0,0 +1,17 @@ +import scanpy as sc +import pandas as pd + +adata="Hw3covid_Data_AllCells.h5ad" + +sc.pp.neighbors(adata) +sc.tl.leiden(adata, resolution=1.0) +sc.settings.verbosity = 2 # reduce the verbosity + + +def get_marker_genes(adata): + adata = sc.read(results_file) + pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5) + return df + + + From 655ed2d37d10c4c3830633da388db467fe845571 Mon Sep 17 00:00:00 2001 From: eceygtt Date: Sun, 27 Apr 2025 02:23:41 +0300 Subject: [PATCH 11/15] Added marked_genes_demo.py file --- src/modules/marked_genes_demo.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/modules/marked_genes_demo.py diff --git a/src/modules/marked_genes_demo.py b/src/modules/marked_genes_demo.py new file mode 100644 index 0000000..e69de29 From e227aa544a42e335a643e6e835b06ab6312b3b11 Mon Sep 17 00:00:00 2001 From: Ekin-hub-code Date: Sun, 27 Apr 2025 17:24:21 +0300 Subject: [PATCH 12/15] Update marked_genes.py --- src/modules/marked_genes.py | 42 ++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/modules/marked_genes.py b/src/modules/marked_genes.py index 3fb30d4..1b36dd2 100644 --- a/src/modules/marked_genes.py +++ b/src/modules/marked_genes.py @@ -1,17 +1,43 @@ import scanpy as sc import pandas as pd -adata="Hw3covid_Data_AllCells.h5ad" +# Load the dataset +adata = sc.datasets.pbmc3k() -sc.pp.neighbors(adata) -sc.tl.leiden(adata, resolution=1.0) -sc.settings.verbosity = 2 # reduce the verbosity +# Log-transform the raw count data (required for differential expression analysis) +sc.pp.log1p(adata) +# Apply PCA and calculate neighbors +sc.pp.pca(adata, svd_solver='arpack') # Apply PCA +sc.pp.neighbors(adata, n_pcs=40) # Calculate neighbors after PCA -def get_marker_genes(adata): - adata = sc.read(results_file) - pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5) - return df +# Run Leiden algorithm (with flavor='igraph' for future compatibility) +sc.tl.leiden(adata, resolution=1.0, flavor="igraph", directed=False, n_iterations=2) +# Run differential expression analysis to rank genes +sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test') +# Save the adata object with the rank_genes_groups results +adata.write("ranked_genes_results.h5ad") # Save the results to a file +# Set verbosity level +sc.settings.verbosity = 2 + +# Define the function to get marker genes +def get_marker_genes(adata, results_file): + # Read the results file + adata = sc.read(results_file) + + # Check if 'rank_genes_groups' exists in adata.uns + if 'rank_genes_groups' in adata.uns: + # Extract the top 5 marker genes as a DataFrame + df = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5) + return df + else: + print("Error: 'rank_genes_groups' not found in adata.uns.") + return None + +# Run the function to get marker genes +marker_genes = get_marker_genes(adata, results_file="ranked_genes_results.h5ad") +if marker_genes is not None: + print(marker_genes) From 96cc703239b4005890d4bbdfe2784872d95ba117 Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Mon, 28 Apr 2025 13:33:27 +0300 Subject: [PATCH 13/15] Add rename_cluster_demo.py to demonstrate renaming cluster labels in AnnData --- src/modules/rename_cluster_demo.py | 69 ++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/modules/rename_cluster_demo.py diff --git a/src/modules/rename_cluster_demo.py b/src/modules/rename_cluster_demo.py new file mode 100644 index 0000000..07f22cf --- /dev/null +++ b/src/modules/rename_cluster_demo.py @@ -0,0 +1,69 @@ +import numpy as np +import pandas as pd +from anndata import AnnData +import scanpy as sc + +# 1. Create dummy data +X = np.random.rand(100, 10) # 100 cells, 10 genes +var_names = [f"Gene{i}" for i in range(10)] +obs = pd.DataFrame({ + # Create a 'leiden' column with categorical values ('0', '1', '2') + 'leiden': pd.Categorical(np.random.choice(['0', '1', '2'], size=100)) +}) +# Create the AnnData object +adata = AnnData(X=X, obs=obs) +adata.var_names = var_names # Set the gene names + +print("Dummy AnnData object created with 'leiden' clusters.") +print(f" Original cluster categories:\n{adata.obs['leiden'].cat.categories}") + + +# 2. Define the new cluster names +# Ensure the number of new names matches the number of categories in 'leiden' +new_cluster_names = ['CD4 T cells', 'Monocytes', 'B cells'] +print(f"\nDefined new cluster names: {new_cluster_names}") + + +# 3. Define the rename_clusters function +def rename_clusters(adata, cluster_algo: str, new_cluster_names: list): + """ + Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names. + + Parameters: + adata (AnnData): The annotated data matrix. + cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain'). + new_cluster_names (list): A list of new cluster names. + + Returns: + None: Updates adata in place. + """ + print(f"Attempting to rename clusters for column '{cluster_algo}'...") + + # Check if the cluster key exists in adata.obs + if cluster_algo not in adata.obs: + raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been performed.") + + # Check if the column is categorical, as rename_categories works on categorical columns + if not pd.api.types.is_categorical_dtype(adata.obs[cluster_algo]): + raise TypeError(f"'{cluster_algo}' column must be categorical.") + + # Get the number of existing clusters (categories) + n_clusters = len(adata.obs[cluster_algo].cat.categories) + + # Check if the number of new names matches the number of clusters + if len(new_cluster_names) != n_clusters: + raise ValueError(f"Expected {n_clusters} new names for '{cluster_algo}', but got {len(new_cluster_names)}.") + + # Perform the renaming + adata.rename_categories(cluster_algo, new_cluster_names) + print(f" Cluster names updated successfully in '{cluster_algo}'.") + + +# 4. Apply the function +print("\nApplying rename_clusters function...") +rename_clusters(adata, cluster_algo='leiden', new_cluster_names=new_cluster_names) + + +# 5. See the results +print("\n Updated cluster names:") +print(adata.obs['leiden'].cat.categories) \ No newline at end of file From f919f47defeb9a0855bff8f792ed506058403232 Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Mon, 28 Apr 2025 13:42:15 +0300 Subject: [PATCH 14/15] Add get_top_ranked_genes_demo function --- src/modules/topranked_demo.py | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/modules/topranked_demo.py diff --git a/src/modules/topranked_demo.py b/src/modules/topranked_demo.py new file mode 100644 index 0000000..3f0b76d --- /dev/null +++ b/src/modules/topranked_demo.py @@ -0,0 +1,50 @@ +import pandas as pd + +def get_top_ranked_genes(adata, top_n=5): + """ + Retrieves the top `top_n` marker genes for each cluster from + `adata.uns['rank_genes_groups']`. + + Parameters: + - adata (AnnData): Annotated data matrix after differential expression analysis (`sc.tl.rank_genes_groups`). + - top_n (int): Number of top-ranked genes to retrieve for each cluster. + + Returns: + - pd.DataFrame: Long-format DataFrame with 'cluster', 'rank', 'gene_name', 'pval', and 'score'. + """ + # Check if 'rank_genes_groups' exists in adata.uns + if 'rank_genes_groups' not in adata.uns: + raise ValueError("No 'rank_genes_groups' found in adata.uns. Run `sc.tl.rank_genes_groups()` first.") + + # Validate the top_n parameter + if not isinstance(top_n, int) or top_n <= 0: + raise ValueError("`top_n` must be a positive integer.") + + # Get the results dictionary + result = adata.uns['rank_genes_groups'] + # Extract cluster names from the result structure + groups = result['names'].dtype.names + + data = [] # List to store data for the DataFrame + # Iterate through each cluster + for cluster in groups: + # Extract top_n gene names for the current cluster + gene_names = result['names'][cluster][:top_n] + # Extract top_n p-values, using .get to handle potential absence of 'pvals' + pvals = result.get('pvals', {}).get(cluster, [None]*top_n) + # Extract top_n scores, using .get to handle potential absence of 'scores' + scores = result.get('scores', {}).get(cluster, [None]*top_n) + + # Iterate through the top genes and their associated values for the cluster + for rank, (gene, pval, score) in enumerate(zip(gene_names, pvals, scores), 1): + # Append a dictionary for each gene to the data list + data.append({ + 'cluster': cluster, + 'rank': rank, + 'gene_name': gene, + 'pval': pval, + 'score': score + }) + + # Create and return a pandas DataFrame from the collected data + return pd.DataFrame(data) \ No newline at end of file From a5d91596649203188914585d633bb3be947ae3f4 Mon Sep 17 00:00:00 2001 From: happymealinthebuilding Date: Mon, 28 Apr 2025 13:43:43 +0300 Subject: [PATCH 15/15] Add dotplot visualization demo for marker gene expression in single-cell RNA-seq data --- src/modules/dotplot_demo.py | 72 +++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 src/modules/dotplot_demo.py diff --git a/src/modules/dotplot_demo.py b/src/modules/dotplot_demo.py new file mode 100644 index 0000000..4351ed5 --- /dev/null +++ b/src/modules/dotplot_demo.py @@ -0,0 +1,72 @@ +import scanpy as sc +import warnings +import pandas as pd + +def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'): + """ + Visualizes marker gene expression across annotated cell clusters. + + Parameters: + adata (AnnData): Annotated data matrix containing single-cell RNA-seq data. + marker_genes (list or dict): Marker genes to visualize. Can be a list or a dict (for grouped markers). + cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden'). + + Returns: + None: Displays dotplot and stacked violin plot of marker gene expression. + """ + # Check if cluster_key exists in adata.obs columns + if cluster_key not in adata.obs.columns: + raise ValueError(f"'{cluster_key}' column not found in adata.obs. Available columns: {list(adata.obs.columns)}") + + # Validate marker genes depending on type (list or dict) + if isinstance(marker_genes, dict): + # Flatten the list of genes from the dictionary values for validation + flat_genes = [gene for genes in marker_genes.values() for gene in genes] + elif isinstance(marker_genes, list): + flat_genes = marker_genes + else: + # Raise an error if marker_genes is not a list or dictionary + raise TypeError("marker_genes must be a list or a dictionary of gene lists.") + + # Check gene existence in adata.var_names + valid_genes = [] + missing_genes = [] + for gene in flat_genes: + if gene in adata.var_names: + valid_genes.append(gene) + else: + missing_genes.append(gene) + + # Warn the user about any genes that were not found + if missing_genes: + warnings.warn(f"The following genes are not found in adata.var_names and will be ignored: {missing_genes}") + + # If no valid genes are left after checking, raise an error + if not valid_genes: + raise ValueError("None of the provided marker genes were found in adata.var_names.") + + # Filter marker_genes based on valid_genes + if isinstance(marker_genes, dict): + # Filter genes within each group in the dictionary + marker_genes = {k: [g for g in v if g in valid_genes] for k, v in marker_genes.items()} + # Remove any groups that become empty after filtering + marker_genes = {k: v for k, v in marker_genes.items() if v} + + else: + # If it was a list, just use the list of valid genes + marker_genes = valid_genes + + # Generate the dotplot + print("Generating dotplot...") + sc.pl.dotplot(adata, marker_genes, groupby=cluster_key) + print("Dotplot generated.") + + # Generate the stacked violin plot + print("Generating stacked violin plot...") + sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90) + print("Stacked violin plot generated.") + +# Note: These plotting functions typically display the plots automatically in interactive environments +# If running as a script, you might need to add: +# import matplotlib.pyplot as plt +# plt.show() \ No newline at end of file