diff --git a/.gitignore b/.gitignore index 0a19790..56fe612 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,5 @@ cython_debug/ # PyPI configuration file .pypirc + + diff --git a/src/modules/dotplot.py b/src/modules/dotplot.py new file mode 100644 index 0000000..eeab89a --- /dev/null +++ b/src/modules/dotplot.py @@ -0,0 +1,23 @@ +import scanpy as sc + +def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'): + """ + Visualizes marker gene expression across annotated cell clusters. + + Parameters: + adata (AnnData): Annotated data matrix containing single-cell RNA-seq data. + marker_genes (list or dict): Marker genes to visualize. + cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden'). + + Returns: + None: Displays dotplot and stacked violin plot of marker gene expression. + """ + + # Dotplot to show average expression and percent of expressing cells + sc.pl.dotplot(adata, marker_genes, groupby=cluster_key) + + # Stacked violin plot to show gene expression distribution per cluster + sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90) + + + diff --git a/src/modules/dotplot_demo.py b/src/modules/dotplot_demo.py new file mode 100644 index 0000000..4351ed5 --- /dev/null +++ b/src/modules/dotplot_demo.py @@ -0,0 +1,72 @@ +import scanpy as sc +import warnings +import pandas as pd + +def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'): + """ + Visualizes marker gene expression across annotated cell clusters. + + Parameters: + adata (AnnData): Annotated data matrix containing single-cell RNA-seq data. + marker_genes (list or dict): Marker genes to visualize. Can be a list or a dict (for grouped markers). + cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden'). + + Returns: + None: Displays dotplot and stacked violin plot of marker gene expression. + """ + # Check if cluster_key exists in adata.obs columns + if cluster_key not in adata.obs.columns: + raise ValueError(f"'{cluster_key}' column not found in adata.obs. Available columns: {list(adata.obs.columns)}") + + # Validate marker genes depending on type (list or dict) + if isinstance(marker_genes, dict): + # Flatten the list of genes from the dictionary values for validation + flat_genes = [gene for genes in marker_genes.values() for gene in genes] + elif isinstance(marker_genes, list): + flat_genes = marker_genes + else: + # Raise an error if marker_genes is not a list or dictionary + raise TypeError("marker_genes must be a list or a dictionary of gene lists.") + + # Check gene existence in adata.var_names + valid_genes = [] + missing_genes = [] + for gene in flat_genes: + if gene in adata.var_names: + valid_genes.append(gene) + else: + missing_genes.append(gene) + + # Warn the user about any genes that were not found + if missing_genes: + warnings.warn(f"The following genes are not found in adata.var_names and will be ignored: {missing_genes}") + + # If no valid genes are left after checking, raise an error + if not valid_genes: + raise ValueError("None of the provided marker genes were found in adata.var_names.") + + # Filter marker_genes based on valid_genes + if isinstance(marker_genes, dict): + # Filter genes within each group in the dictionary + marker_genes = {k: [g for g in v if g in valid_genes] for k, v in marker_genes.items()} + # Remove any groups that become empty after filtering + marker_genes = {k: v for k, v in marker_genes.items() if v} + + else: + # If it was a list, just use the list of valid genes + marker_genes = valid_genes + + # Generate the dotplot + print("Generating dotplot...") + sc.pl.dotplot(adata, marker_genes, groupby=cluster_key) + print("Dotplot generated.") + + # Generate the stacked violin plot + print("Generating stacked violin plot...") + sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90) + print("Stacked violin plot generated.") + +# Note: These plotting functions typically display the plots automatically in interactive environments +# If running as a script, you might need to add: +# import matplotlib.pyplot as plt +# plt.show() \ No newline at end of file diff --git a/src/modules/get_top_ranked_genes.py b/src/modules/get_top_ranked_genes.py new file mode 100644 index 0000000..6f7cf8e --- /dev/null +++ b/src/modules/get_top_ranked_genes.py @@ -0,0 +1,38 @@ +import pandas as pd + +def extract_top_genes(adata, n_top=5): + """ + Extracts top-ranked genes and p-values from adata.uns['rank_genes_groups']. + + Parameters: + - adata: AnnData object after rank_genes_groups has been run. + - n_top: Number of top genes to return (default is 5). + + Returns: + - A tuple of two DataFrames: + 1. Top gene names per group. + 2. A combined DataFrame of top genes and p-values. + """ + + if 'rank_genes_groups' not in adata.uns: + raise ValueError("No 'rank_genes_groups' results found in adata.uns. Did you run `sc.tl.rank_genes_groups`?") + + result = adata.uns['rank_genes_groups'] + groups = result['names'].dtype.names + + top_gene_names = pd.DataFrame(result['names']).head(n_top) + + combined_df = pd.DataFrame({ + f"{group}_n": result['names'][group][:n_top] + for group in groups + }) + combined_df_pvals = pd.DataFrame({ + f"{group}_p": result['pvals'][group][:n_top] + for group in groups + }) + + combined = pd.concat([combined_df, combined_df_pvals], axis=1) + + return top_gene_names, combined + + diff --git a/src/modules/marked_genes.py b/src/modules/marked_genes.py new file mode 100644 index 0000000..1b36dd2 --- /dev/null +++ b/src/modules/marked_genes.py @@ -0,0 +1,43 @@ +import scanpy as sc +import pandas as pd + +# Load the dataset +adata = sc.datasets.pbmc3k() + +# Log-transform the raw count data (required for differential expression analysis) +sc.pp.log1p(adata) + +# Apply PCA and calculate neighbors +sc.pp.pca(adata, svd_solver='arpack') # Apply PCA +sc.pp.neighbors(adata, n_pcs=40) # Calculate neighbors after PCA + +# Run Leiden algorithm (with flavor='igraph' for future compatibility) +sc.tl.leiden(adata, resolution=1.0, flavor="igraph", directed=False, n_iterations=2) + +# Run differential expression analysis to rank genes +sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test') + +# Save the adata object with the rank_genes_groups results +adata.write("ranked_genes_results.h5ad") # Save the results to a file + +# Set verbosity level +sc.settings.verbosity = 2 + +# Define the function to get marker genes +def get_marker_genes(adata, results_file): + # Read the results file + adata = sc.read(results_file) + + # Check if 'rank_genes_groups' exists in adata.uns + if 'rank_genes_groups' in adata.uns: + # Extract the top 5 marker genes as a DataFrame + df = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5) + return df + else: + print("Error: 'rank_genes_groups' not found in adata.uns.") + return None + +# Run the function to get marker genes +marker_genes = get_marker_genes(adata, results_file="ranked_genes_results.h5ad") +if marker_genes is not None: + print(marker_genes) diff --git a/src/modules/marked_genes_demo.py b/src/modules/marked_genes_demo.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/rename_categories.py b/src/modules/rename_categories.py new file mode 100644 index 0000000..f3f1385 --- /dev/null +++ b/src/modules/rename_categories.py @@ -0,0 +1,25 @@ +def rename_clusters(adata, cluster_algo: str, new_cluster_names: list): + """ + Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names. + + Parameters: + adata (AnnData): The annotated data matrix. + cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain'). + new_cluster_names (list): A list of new cluster names. + + Returns: + None: Updates adata in place. + """ + + if cluster_algo not in adata.obs: + raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been run.") + + n_clusters = len(adata.obs[cluster_algo].cat.categories) + + if len(new_cluster_names) != n_clusters: + raise ValueError(f"Number of new names ({len(new_cluster_names)}) does not match number of clusters ({n_clusters}).") + + adata.rename_categories(cluster_algo, new_cluster_names) + print(f"Clusters renamed using '{cluster_algo}'.") + + diff --git a/src/modules/rename_cluster_demo.py b/src/modules/rename_cluster_demo.py new file mode 100644 index 0000000..07f22cf --- /dev/null +++ b/src/modules/rename_cluster_demo.py @@ -0,0 +1,69 @@ +import numpy as np +import pandas as pd +from anndata import AnnData +import scanpy as sc + +# 1. Create dummy data +X = np.random.rand(100, 10) # 100 cells, 10 genes +var_names = [f"Gene{i}" for i in range(10)] +obs = pd.DataFrame({ + # Create a 'leiden' column with categorical values ('0', '1', '2') + 'leiden': pd.Categorical(np.random.choice(['0', '1', '2'], size=100)) +}) +# Create the AnnData object +adata = AnnData(X=X, obs=obs) +adata.var_names = var_names # Set the gene names + +print("Dummy AnnData object created with 'leiden' clusters.") +print(f" Original cluster categories:\n{adata.obs['leiden'].cat.categories}") + + +# 2. Define the new cluster names +# Ensure the number of new names matches the number of categories in 'leiden' +new_cluster_names = ['CD4 T cells', 'Monocytes', 'B cells'] +print(f"\nDefined new cluster names: {new_cluster_names}") + + +# 3. Define the rename_clusters function +def rename_clusters(adata, cluster_algo: str, new_cluster_names: list): + """ + Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names. + + Parameters: + adata (AnnData): The annotated data matrix. + cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain'). + new_cluster_names (list): A list of new cluster names. + + Returns: + None: Updates adata in place. + """ + print(f"Attempting to rename clusters for column '{cluster_algo}'...") + + # Check if the cluster key exists in adata.obs + if cluster_algo not in adata.obs: + raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been performed.") + + # Check if the column is categorical, as rename_categories works on categorical columns + if not pd.api.types.is_categorical_dtype(adata.obs[cluster_algo]): + raise TypeError(f"'{cluster_algo}' column must be categorical.") + + # Get the number of existing clusters (categories) + n_clusters = len(adata.obs[cluster_algo].cat.categories) + + # Check if the number of new names matches the number of clusters + if len(new_cluster_names) != n_clusters: + raise ValueError(f"Expected {n_clusters} new names for '{cluster_algo}', but got {len(new_cluster_names)}.") + + # Perform the renaming + adata.rename_categories(cluster_algo, new_cluster_names) + print(f" Cluster names updated successfully in '{cluster_algo}'.") + + +# 4. Apply the function +print("\nApplying rename_clusters function...") +rename_clusters(adata, cluster_algo='leiden', new_cluster_names=new_cluster_names) + + +# 5. See the results +print("\n Updated cluster names:") +print(adata.obs['leiden'].cat.categories) \ No newline at end of file diff --git a/src/modules/topranked_demo.py b/src/modules/topranked_demo.py new file mode 100644 index 0000000..3f0b76d --- /dev/null +++ b/src/modules/topranked_demo.py @@ -0,0 +1,50 @@ +import pandas as pd + +def get_top_ranked_genes(adata, top_n=5): + """ + Retrieves the top `top_n` marker genes for each cluster from + `adata.uns['rank_genes_groups']`. + + Parameters: + - adata (AnnData): Annotated data matrix after differential expression analysis (`sc.tl.rank_genes_groups`). + - top_n (int): Number of top-ranked genes to retrieve for each cluster. + + Returns: + - pd.DataFrame: Long-format DataFrame with 'cluster', 'rank', 'gene_name', 'pval', and 'score'. + """ + # Check if 'rank_genes_groups' exists in adata.uns + if 'rank_genes_groups' not in adata.uns: + raise ValueError("No 'rank_genes_groups' found in adata.uns. Run `sc.tl.rank_genes_groups()` first.") + + # Validate the top_n parameter + if not isinstance(top_n, int) or top_n <= 0: + raise ValueError("`top_n` must be a positive integer.") + + # Get the results dictionary + result = adata.uns['rank_genes_groups'] + # Extract cluster names from the result structure + groups = result['names'].dtype.names + + data = [] # List to store data for the DataFrame + # Iterate through each cluster + for cluster in groups: + # Extract top_n gene names for the current cluster + gene_names = result['names'][cluster][:top_n] + # Extract top_n p-values, using .get to handle potential absence of 'pvals' + pvals = result.get('pvals', {}).get(cluster, [None]*top_n) + # Extract top_n scores, using .get to handle potential absence of 'scores' + scores = result.get('scores', {}).get(cluster, [None]*top_n) + + # Iterate through the top genes and their associated values for the cluster + for rank, (gene, pval, score) in enumerate(zip(gene_names, pvals, scores), 1): + # Append a dictionary for each gene to the data list + data.append({ + 'cluster': cluster, + 'rank': rank, + 'gene_name': gene, + 'pval': pval, + 'score': score + }) + + # Create and return a pandas DataFrame from the collected data + return pd.DataFrame(data) \ No newline at end of file