vbilsmith · rando2 · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 23, 2026
diff --git a/miRNA_OC_preprocessing_and_nmf.qmd b/miRNA_OC_preprocessing_and_nmf.qmd
@@ -0,0 +1,187 @@
+---
+title: "preprocessing_and_nmf_cluster"
+format: html
+editor: visual
+---
+
+```{r}
+#install.packages(c("tidyverse", "DESeq2", "umap"))
+library(tidyverse)
+
+#if (!requireNamespace("BiocManager", quietly = TRUE)) {
+#  install.packages("BiocManager")
+#}
+#BiocManager::install("Biobase") 
+#BiocManager::install("NMF")
+
+library(Biobase)
+library(umap)
+#library(consensusOV)
+#library(TCGAbiolinks)
+library(dplyr)
+library(reshape2)
+library(ggplot2)
+library(factoextra)
+library(NMF)
+library(GGally)
+```
+
+```{r}
+full_dir <- "/home/hms68/project_pi_rf273/hms68/miRNA_OC/miRNA_jan_26/TCGA-data_10:10:24_RNAandmiRNA"
+txt_files <- list.files(full_dir, pattern = "\\.txt$", recursive = TRUE, full.names = TRUE)
+mirna_files <- txt_files[grepl("mirna", txt_files)]
+```
+
+```{r}
+#load miRNA data
+mirna_list <- list()
+for (file in mirna_files) {
+  temp_data <- read.delim(file, header = TRUE, sep = "\t", comment.char = "#") # confirm that we have the column we need 
+  if (!all(c("miRNA_ID", "read_count") %in% colnames(temp_data))) {
+    cat("Skipping file, missing columns:", file, "\n")
+    next
+  }
+
+  temp_data <- temp_data[, c("miRNA_ID", "read_count")] #select the columns we need
+
+  sample_name <- basename(file) #rename column
+  colnames(temp_data)[2] <- sample_name
+  mirna_list[[sample_name]] <- temp_data #store data
+}
+mirna_expression_combined <- Reduce(function(x, y) merge(x, y, by = "miRNA_ID", all = TRUE), mirna_list) # merge to combined expression matrix
+mirna_expression_combined[is.na(mirna_expression_combined)] <- 0 # any NA to 0, representing missing value
+```
+
+Transposing the data (patients as rows and miRNAs as columns)
+
+```{r}
+#write.csv(mirna_expression_combined, row.names = FALSE)
+miRNA_ids <- mirna_expression_combined$miRNA_ID
+miRNA_data_numeric <- mirna_expression_combined[, -1]
+miRNA_data_t <- as.data.frame(t(miRNA_data_numeric))
+colnames(miRNA_data_t) <- miRNA_ids
+rownames(miRNA_data_t) <- rownames(t(miRNA_data_numeric))
+head(miRNA_data_t)
+```
+
+sum
+
+```{r}
+miRNA_sums <- apply(miRNA_data_t, 2, sum, na.rm = TRUE)
+miRNA_sums_df <- data.frame(miRNA_ID = names(miRNA_sums), Expression_Sum = miRNA_sums)
+miRNA_sums_zero <- miRNA_sums_df |>
+  filter(Expression_Sum < 1) #321 miRNAs are zero, this makes sense because they all have 1881 miRNA (there are nearly 2,800 human miRNAs but I am not sure why we are only using 1881) 
+
+#filter out non-zero miRNA sums
+miRNA_sums_nonzero <- miRNA_sums_df |>
+  filter(Expression_Sum >= 1) #1560 miRNAs are non-zero Mean square error, SD, RMSD, metrics for error 
+```
+
+```{r}
+library(ggplot2)
+#visualize with density plot of the log-transformed sum 
+ggplot(miRNA_sums_nonzero, aes(x = log(Expression_Sum))) +
+  geom_density(fill = "pink", color = "black", alpha = 0.6) +
+  labs(x = "log(Expression_Sum)", y = "Density") + theme_minimal()
+```
+
+```{r}
+#filter miRNAs identified as non-zero in the original data
+nonzero_miRNAs <- miRNA_sums_nonzero$miRNA_ID
+
+filtered_mirna_expression_combined <- mirna_expression_combined %>%
+  filter(miRNA_ID %in% nonzero_miRNAs)
+
+for (patient in colnames(filtered_mirna_expression_combined)[-1]) { # exclude the miRNA_ID column
+  output_file <- paste0(full_dir, "/filtered_", patient, ".txt")
+  write.table(filtered_mirna_expression_combined[, c("miRNA_ID", patient)], file = output_file, sep = "\t", row.names = FALSE, quote = FALSE)
+}
+```
+
+```{r}
+#miRNA_data_t_com_nor is Transposing combined data and converting to df 
+miRNA_ids_com <- filtered_mirna_expression_combined$miRNA_ID
+miRNA_data_numeric_com <- filtered_mirna_expression_combined[, -1]
+#miRNA_data_t_com_nor <- as.data.frame(t(miRNA_data_numeric_com))
+miRNA_data_t_com_nor <- as.data.frame((miRNA_data_numeric_com))
+colnames(miRNA_data_t_com_nor) <- miRNA_ids_com
+rownames(miRNA_data_t_com_nor) <- rownames(t(miRNA_data_numeric_com))
+```
+
+PCA - data explorarion
+
+```{r}
+#dataframe 
+miRNA_data_t_com_nor_df <- as.data.frame(miRNA_data_t_com_nor)
+
+perform_pca <- function(data, scale_data = TRUE) {
+  pca_result <- prcomp(data, center = TRUE, scale. = scale_data)
+  return(pca_result)
+}
+
+miRNA_data_pca4 <- perform_pca(miRNA_data_t_com_nor_df)
+fviz_eig(miRNA_data_pca4, addlabels = TRUE, ylim = c(0, 50),
+         main = "") 
+```
+
+```{r}
+#create a data frame of the first 5 principal components
+pca_pairwise_data <- as.data.frame(miRNA_data_pca4$x[, 1:5])
+colnames(pca_pairwise_data) <- paste0("PC", 1:5)
+
+#pairwise plot of the first 5 principal components
+ggpairs(pca_pairwise_data,
+        title = "Pairwise Plots of Principal Components 1-5")
+```
+
+```{r}
+explained_variance <- cumsum(miRNA_data_pca4$sdev^2 / sum(miRNA_data_pca4$sdev^2)) * 100 #80% explained with close to 200 PCs
+```
+
+Clustering - (using k-means)
+
+```{r}
+set.seed(123)
+number_of_clusters <- 4  #assume there are four clusters, what would they look like using miRNA?
+kmeans_result <- kmeans(miRNA_data_t_com_nor, centers = number_of_clusters, nstart = 25)
+#kmeans_result <- kmeans(filtered_miRNA_data_by_hybrid, centers = number_of_clusters, nstart = 25)
+
+#visualize the clusters
+pca_data <- as.data.frame(miRNA_data_pca4$x) 
+pca_data$Cluster <- as.factor(kmeans_result$cluster)
+
+ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) + geom_point(size = 3, alpha = 0.7) +
+  labs( x = "Principal Component 1", y = "Principal Component 2") + theme_minimal() + theme(legend.position = "right")
+```
+
+potential another approach for filtering - using hybrid mean and **high mean and high variability first**
+
+```{r}
+
+```
+
+Clustering - using NMF
+
+```{r}
+set.seed(42) 
+k_range <- 2:8 #assuming the clusters will be in the rangg 2 - 8
+#nmf_result <- nmf(miRNA_data_t_com_nor, rank = k_range, method = "brunet", nrun = 50, .opt = "v-p")
+nmf_result <- nmf(miRNA_data_t_com_nor, rank = 4, method = "brunet", nrun = 1, .opt = "v-p")
+
+#plot to visualuze 
+consensusmap(nmf_result, main = "Consensus NMF Clustering")a
+
+```
+
+We can also use other clustering methods - including but not limited to Gaussian Mixture Models (GMM) which unlike k-means uses soft assignments.
+
+```{r}
+#get the assigned cluster members 
+#get cluster assignments for best rank
+#best_model <- fit(nmf_result, rank = 4)  
+best_model <- fit(nmf_result) 
+clusters <- apply(coef(best_model), 2, which.max)
+
+table(clusters) #view assignments
+split(names(clusters), clusters) #get members of each cluster
+```
diff --git a/mirna_subtypes.Rproj b/mirna_subtypes.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX