IEEE-Ege · happymealinthebuilding · Apr 11, 2025 · Apr 20, 2025 · Apr 20, 2025 · Apr 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -172,3 +172,5 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+
diff --git a/src/modules/dotplot.py b/src/modules/dotplot.py
@@ -0,0 +1,23 @@
+import scanpy as sc
+
+def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'):
+    """
+    Visualizes marker gene expression across annotated cell clusters.
+
+    Parameters:
+        adata (AnnData): Annotated data matrix containing single-cell RNA-seq data.
+        marker_genes (list or dict): Marker genes to visualize.
+        cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden').
+
+    Returns:
+        None: Displays dotplot and stacked violin plot of marker gene expression.
+    """
+
+    # Dotplot to show average expression and percent of expressing cells
+    sc.pl.dotplot(adata, marker_genes, groupby=cluster_key)
+
+    # Stacked violin plot to show gene expression distribution per cluster
+    sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90)
+
+
+
diff --git a/src/modules/dotplot_demo.py b/src/modules/dotplot_demo.py
@@ -0,0 +1,72 @@
+import scanpy as sc
+import warnings
+import pandas as pd
+
+def visualize_marker_genes(adata, marker_genes, cluster_key='leiden'):
+    """
+    Visualizes marker gene expression across annotated cell clusters.
+
+    Parameters:
+        adata (AnnData): Annotated data matrix containing single-cell RNA-seq data.
+        marker_genes (list or dict): Marker genes to visualize. Can be a list or a dict (for grouped markers).
+        cluster_key (str): Key in `adata.obs` to group cells by (default: 'leiden').
+
+    Returns:
+        None: Displays dotplot and stacked violin plot of marker gene expression.
+    """
+    # Check if cluster_key exists in adata.obs columns
+    if cluster_key not in adata.obs.columns:
+        raise ValueError(f"'{cluster_key}' column not found in adata.obs. Available columns: {list(adata.obs.columns)}")
+
+    # Validate marker genes depending on type (list or dict)
+    if isinstance(marker_genes, dict):
+        # Flatten the list of genes from the dictionary values for validation
+        flat_genes = [gene for genes in marker_genes.values() for gene in genes]
+    elif isinstance(marker_genes, list):
+        flat_genes = marker_genes
+    else:
+        # Raise an error if marker_genes is not a list or dictionary
+        raise TypeError("marker_genes must be a list or a dictionary of gene lists.")
+
+    # Check gene existence in adata.var_names
+    valid_genes = []
+    missing_genes = []
+    for gene in flat_genes:
+        if gene in adata.var_names:
+            valid_genes.append(gene)
+        else:
+            missing_genes.append(gene)
+
+    # Warn the user about any genes that were not found
+    if missing_genes:
+        warnings.warn(f"The following genes are not found in adata.var_names and will be ignored: {missing_genes}")
+
+    # If no valid genes are left after checking, raise an error
+    if not valid_genes:
+        raise ValueError("None of the provided marker genes were found in adata.var_names.")
+
+    # Filter marker_genes based on valid_genes
+    if isinstance(marker_genes, dict):
+        # Filter genes within each group in the dictionary
+        marker_genes = {k: [g for g in v if g in valid_genes] for k, v in marker_genes.items()}
+        # Remove any groups that become empty after filtering
+        marker_genes = {k: v for k, v in marker_genes.items() if v}
+
+    else:
+        # If it was a list, just use the list of valid genes
+        marker_genes = valid_genes
+
+    # Generate the dotplot
+    print("Generating dotplot...")
+    sc.pl.dotplot(adata, marker_genes, groupby=cluster_key)
+    print("Dotplot generated.")
+
+    # Generate the stacked violin plot
+    print("Generating stacked violin plot...")
+    sc.pl.stacked_violin(adata, marker_genes, groupby=cluster_key, rotation=90)
+    print("Stacked violin plot generated.")
+
+# Note: These plotting functions typically display the plots automatically in interactive environments
+# If running as a script, you might need to add:
+# import matplotlib.pyplot as plt
+# plt.show()
diff --git a/src/modules/get_top_ranked_genes.py b/src/modules/get_top_ranked_genes.py
@@ -0,0 +1,38 @@
+import pandas as pd
+
+def extract_top_genes(adata, n_top=5):
+    """
+    Extracts top-ranked genes and p-values from adata.uns['rank_genes_groups'].
+
+    Parameters:
+    - adata: AnnData object after rank_genes_groups has been run.
+    - n_top: Number of top genes to return (default is 5).
+
+    Returns:
+    - A tuple of two DataFrames:
+        1. Top gene names per group.
+        2. A combined DataFrame of top genes and p-values.
+    """
+
+    if 'rank_genes_groups' not in adata.uns:
+        raise ValueError("No 'rank_genes_groups' results found in adata.uns. Did you run `sc.tl.rank_genes_groups`?")
+
+    result = adata.uns['rank_genes_groups']
+    groups = result['names'].dtype.names
+
+    top_gene_names = pd.DataFrame(result['names']).head(n_top)
+
+    combined_df = pd.DataFrame({
+        f"{group}_n": result['names'][group][:n_top]
+        for group in groups
+    })
+    combined_df_pvals = pd.DataFrame({
+        f"{group}_p": result['pvals'][group][:n_top]
+        for group in groups
+    })
+
+    combined = pd.concat([combined_df, combined_df_pvals], axis=1)
+
+    return top_gene_names, combined
+
+
diff --git a/src/modules/marked_genes.py b/src/modules/marked_genes.py
@@ -0,0 +1,43 @@
+import scanpy as sc
+import pandas as pd
+
+# Load the dataset
+adata = sc.datasets.pbmc3k()
+
+# Log-transform the raw count data (required for differential expression analysis)
+sc.pp.log1p(adata)
+
+# Apply PCA and calculate neighbors
+sc.pp.pca(adata, svd_solver='arpack')  # Apply PCA
+sc.pp.neighbors(adata, n_pcs=40)  # Calculate neighbors after PCA
+
+# Run Leiden algorithm (with flavor='igraph' for future compatibility)
+sc.tl.leiden(adata, resolution=1.0, flavor="igraph", directed=False, n_iterations=2)
+
+# Run differential expression analysis to rank genes
+sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')
+
+# Save the adata object with the rank_genes_groups results
+adata.write("ranked_genes_results.h5ad")  # Save the results to a file
+
+# Set verbosity level
+sc.settings.verbosity = 2
+
+# Define the function to get marker genes
+def get_marker_genes(adata, results_file):
+    # Read the results file
+    adata = sc.read(results_file)
+
+    # Check if 'rank_genes_groups' exists in adata.uns
+    if 'rank_genes_groups' in adata.uns:
+        # Extract the top 5 marker genes as a DataFrame
+        df = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)
+        return df
+    else:
+        print("Error: 'rank_genes_groups' not found in adata.uns.")
+        return None
+
+# Run the function to get marker genes
+marker_genes = get_marker_genes(adata, results_file="ranked_genes_results.h5ad")
+if marker_genes is not None:
+    print(marker_genes)
diff --git a/src/modules/marked_genes_demo.py b/src/modules/marked_genes_demo.py
diff --git a/src/modules/rename_categories.py b/src/modules/rename_categories.py
@@ -0,0 +1,25 @@
+def rename_clusters(adata, cluster_algo: str, new_cluster_names: list):
+    """
+    Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names.
+
+    Parameters:
+        adata (AnnData): The annotated data matrix.
+        cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain').
+        new_cluster_names (list): A list of new cluster names.
+
+    Returns:
+        None: Updates adata in place.
+    """
+
+    if cluster_algo not in adata.obs:
+        raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been run.")
+
+    n_clusters = len(adata.obs[cluster_algo].cat.categories)
+
+    if len(new_cluster_names) != n_clusters:
+        raise ValueError(f"Number of new names ({len(new_cluster_names)}) does not match number of clusters ({n_clusters}).")
+
+    adata.rename_categories(cluster_algo, new_cluster_names)
+    print(f"Clusters renamed using '{cluster_algo}'.")
+
+
diff --git a/src/modules/rename_cluster_demo.py b/src/modules/rename_cluster_demo.py
@@ -0,0 +1,69 @@
+import numpy as np 
+import pandas as pd
+from anndata import AnnData
+import scanpy as sc
+
+# 1. Create dummy data
+X = np.random.rand(100, 10)  # 100 cells, 10 genes
+var_names = [f"Gene{i}" for i in range(10)]
+obs = pd.DataFrame({
+    # Create a 'leiden' column with categorical values ('0', '1', '2')
+    'leiden': pd.Categorical(np.random.choice(['0', '1', '2'], size=100))
+})
+# Create the AnnData object
+adata = AnnData(X=X, obs=obs)
+adata.var_names = var_names # Set the gene names
+
+print("Dummy AnnData object created with 'leiden' clusters.")
+print(f" Original cluster categories:\n{adata.obs['leiden'].cat.categories}")
+
+
+# 2. Define the new cluster names
+# Ensure the number of new names matches the number of categories in 'leiden'
+new_cluster_names = ['CD4 T cells', 'Monocytes', 'B cells']
+print(f"\nDefined new cluster names: {new_cluster_names}")
+
+
+# 3. Define the rename_clusters function
+def rename_clusters(adata, cluster_algo: str, new_cluster_names: list):
+    """
+    Renames cluster labels in adata.obs using a chosen clustering algorithm key and new names.
+
+    Parameters:
+        adata (AnnData): The annotated data matrix.
+        cluster_algo (str): The key in `adata.obs` representing clustering (e.g., 'leiden', 'louvain').
+        new_cluster_names (list): A list of new cluster names.
+
+    Returns:
+        None: Updates adata in place.
+    """
+    print(f"Attempting to rename clusters for column '{cluster_algo}'...")
+
+    # Check if the cluster key exists in adata.obs
+    if cluster_algo not in adata.obs:
+        raise ValueError(f"'{cluster_algo}' not found in adata.obs. Make sure clustering has been performed.")
+
+    # Check if the column is categorical, as rename_categories works on categorical columns
+    if not pd.api.types.is_categorical_dtype(adata.obs[cluster_algo]):
+        raise TypeError(f"'{cluster_algo}' column must be categorical.")
+
+    # Get the number of existing clusters (categories)
+    n_clusters = len(adata.obs[cluster_algo].cat.categories)
+
+    # Check if the number of new names matches the number of clusters
+    if len(new_cluster_names) != n_clusters:
+        raise ValueError(f"Expected {n_clusters} new names for '{cluster_algo}', but got {len(new_cluster_names)}.")
+
+    # Perform the renaming
+    adata.rename_categories(cluster_algo, new_cluster_names)
+    print(f" Cluster names updated successfully in '{cluster_algo}'.")
+
+
+# 4. Apply the function
+print("\nApplying rename_clusters function...")
+rename_clusters(adata, cluster_algo='leiden', new_cluster_names=new_cluster_names)
+
+
+# 5. See the results
+print("\n Updated cluster names:")
+print(adata.obs['leiden'].cat.categories)
diff --git a/src/modules/topranked_demo.py b/src/modules/topranked_demo.py
@@ -0,0 +1,50 @@
+import pandas as pd
+
+def get_top_ranked_genes(adata, top_n=5):
+    """
+    Retrieves the top `top_n` marker genes for each cluster from
+    `adata.uns['rank_genes_groups']`.
+
+    Parameters:
+    - adata (AnnData): Annotated data matrix after differential expression analysis (`sc.tl.rank_genes_groups`).
+    - top_n (int): Number of top-ranked genes to retrieve for each cluster.
+
+    Returns:
+    - pd.DataFrame: Long-format DataFrame with 'cluster', 'rank', 'gene_name', 'pval', and 'score'.
+    """
+    # Check if 'rank_genes_groups' exists in adata.uns
+    if 'rank_genes_groups' not in adata.uns:
+        raise ValueError("No 'rank_genes_groups' found in adata.uns. Run `sc.tl.rank_genes_groups()` first.")
+
+    # Validate the top_n parameter
+    if not isinstance(top_n, int) or top_n <= 0:
+        raise ValueError("`top_n` must be a positive integer.")
+
+    # Get the results dictionary
+    result = adata.uns['rank_genes_groups']
+    # Extract cluster names from the result structure
+    groups = result['names'].dtype.names
+
+    data = [] # List to store data for the DataFrame
+    # Iterate through each cluster
+    for cluster in groups:
+        # Extract top_n gene names for the current cluster
+        gene_names = result['names'][cluster][:top_n]
+        # Extract top_n p-values, using .get to handle potential absence of 'pvals'
+        pvals = result.get('pvals', {}).get(cluster, [None]*top_n)
+        # Extract top_n scores, using .get to handle potential absence of 'scores'
+        scores = result.get('scores', {}).get(cluster, [None]*top_n)
+
+        # Iterate through the top genes and their associated values for the cluster
+        for rank, (gene, pval, score) in enumerate(zip(gene_names, pvals, scores), 1):
+            # Append a dictionary for each gene to the data list
+            data.append({
+                'cluster': cluster,
+                'rank': rank,
+                'gene_name': gene,
+                'pval': pval,
+                'score': score
+            })
+
+    # Create and return a pandas DataFrame from the collected data
+    return pd.DataFrame(data)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -172,3 +172,5 @@ cython_debug/

		# PyPI configuration file
		.pypirc