Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions miRNA_OC_preprocessing_and_nmf.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
---
title: "preprocessing_and_nmf_cluster"
format: html
editor: visual
---

```{r}
#install.packages(c("tidyverse", "DESeq2", "umap"))
library(tidyverse)

#if (!requireNamespace("BiocManager", quietly = TRUE)) {
# install.packages("BiocManager")
#}
#BiocManager::install("Biobase")
#BiocManager::install("NMF")

library(Biobase)
library(umap)
#library(consensusOV)
#library(TCGAbiolinks)
library(dplyr)
library(reshape2)
library(ggplot2)
library(factoextra)
library(NMF)
library(GGally)
```

```{r}
full_dir <- "/home/hms68/project_pi_rf273/hms68/miRNA_OC/miRNA_jan_26/TCGA-data_10:10:24_RNAandmiRNA"
txt_files <- list.files(full_dir, pattern = "\\.txt$", recursive = TRUE, full.names = TRUE)
mirna_files <- txt_files[grepl("mirna", txt_files)]
```

```{r}
#load miRNA data
mirna_list <- list()
for (file in mirna_files) {
temp_data <- read.delim(file, header = TRUE, sep = "\t", comment.char = "#") # confirm that we have the column we need
if (!all(c("miRNA_ID", "read_count") %in% colnames(temp_data))) {
cat("Skipping file, missing columns:", file, "\n")
next
}

temp_data <- temp_data[, c("miRNA_ID", "read_count")] #select the columns we need

sample_name <- basename(file) #rename column
colnames(temp_data)[2] <- sample_name
mirna_list[[sample_name]] <- temp_data #store data
}
mirna_expression_combined <- Reduce(function(x, y) merge(x, y, by = "miRNA_ID", all = TRUE), mirna_list) # merge to combined expression matrix
mirna_expression_combined[is.na(mirna_expression_combined)] <- 0 # any NA to 0, representing missing value
```

Transposing the data (patients as rows and miRNAs as columns)

```{r}
#write.csv(mirna_expression_combined, row.names = FALSE)
miRNA_ids <- mirna_expression_combined$miRNA_ID
miRNA_data_numeric <- mirna_expression_combined[, -1]
miRNA_data_t <- as.data.frame(t(miRNA_data_numeric))
colnames(miRNA_data_t) <- miRNA_ids
rownames(miRNA_data_t) <- rownames(t(miRNA_data_numeric))
head(miRNA_data_t)
```

sum

```{r}
miRNA_sums <- apply(miRNA_data_t, 2, sum, na.rm = TRUE)
miRNA_sums_df <- data.frame(miRNA_ID = names(miRNA_sums), Expression_Sum = miRNA_sums)
miRNA_sums_zero <- miRNA_sums_df |>
filter(Expression_Sum < 1) #321 miRNAs are zero, this makes sense because they all have 1881 miRNA (there are nearly 2,800 human miRNAs but I am not sure why we are only using 1881)

#filter out non-zero miRNA sums
miRNA_sums_nonzero <- miRNA_sums_df |>
filter(Expression_Sum >= 1) #1560 miRNAs are non-zero Mean square error, SD, RMSD, metrics for error
```

```{r}
library(ggplot2)
#visualize with density plot of the log-transformed sum
ggplot(miRNA_sums_nonzero, aes(x = log(Expression_Sum))) +
geom_density(fill = "pink", color = "black", alpha = 0.6) +
labs(x = "log(Expression_Sum)", y = "Density") + theme_minimal()
```

```{r}
#filter miRNAs identified as non-zero in the original data
nonzero_miRNAs <- miRNA_sums_nonzero$miRNA_ID

filtered_mirna_expression_combined <- mirna_expression_combined %>%
filter(miRNA_ID %in% nonzero_miRNAs)

for (patient in colnames(filtered_mirna_expression_combined)[-1]) { # exclude the miRNA_ID column
output_file <- paste0(full_dir, "/filtered_", patient, ".txt")
write.table(filtered_mirna_expression_combined[, c("miRNA_ID", patient)], file = output_file, sep = "\t", row.names = FALSE, quote = FALSE)
}
```

```{r}
#miRNA_data_t_com_nor is Transposing combined data and converting to df
miRNA_ids_com <- filtered_mirna_expression_combined$miRNA_ID
miRNA_data_numeric_com <- filtered_mirna_expression_combined[, -1]
#miRNA_data_t_com_nor <- as.data.frame(t(miRNA_data_numeric_com))
miRNA_data_t_com_nor <- as.data.frame((miRNA_data_numeric_com))
colnames(miRNA_data_t_com_nor) <- miRNA_ids_com
rownames(miRNA_data_t_com_nor) <- rownames(t(miRNA_data_numeric_com))
```

PCA - data explorarion

```{r}
#dataframe
miRNA_data_t_com_nor_df <- as.data.frame(miRNA_data_t_com_nor)

perform_pca <- function(data, scale_data = TRUE) {
pca_result <- prcomp(data, center = TRUE, scale. = scale_data)
return(pca_result)
}

miRNA_data_pca4 <- perform_pca(miRNA_data_t_com_nor_df)
fviz_eig(miRNA_data_pca4, addlabels = TRUE, ylim = c(0, 50),
main = "")
```

```{r}
#create a data frame of the first 5 principal components
pca_pairwise_data <- as.data.frame(miRNA_data_pca4$x[, 1:5])
colnames(pca_pairwise_data) <- paste0("PC", 1:5)

#pairwise plot of the first 5 principal components
ggpairs(pca_pairwise_data,
title = "Pairwise Plots of Principal Components 1-5")
```

```{r}
explained_variance <- cumsum(miRNA_data_pca4$sdev^2 / sum(miRNA_data_pca4$sdev^2)) * 100 #80% explained with close to 200 PCs
```

Clustering - (using k-means)

```{r}
set.seed(123)
number_of_clusters <- 4 #assume there are four clusters, what would they look like using miRNA?
kmeans_result <- kmeans(miRNA_data_t_com_nor, centers = number_of_clusters, nstart = 25)
#kmeans_result <- kmeans(filtered_miRNA_data_by_hybrid, centers = number_of_clusters, nstart = 25)

#visualize the clusters
pca_data <- as.data.frame(miRNA_data_pca4$x)
pca_data$Cluster <- as.factor(kmeans_result$cluster)

ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) + geom_point(size = 3, alpha = 0.7) +
labs( x = "Principal Component 1", y = "Principal Component 2") + theme_minimal() + theme(legend.position = "right")
```

potential another approach for filtering - using hybrid mean and **high mean and high variability first**

```{r}

```

Clustering - using NMF

```{r}
set.seed(42)
k_range <- 2:8 #assuming the clusters will be in the rangg 2 - 8
#nmf_result <- nmf(miRNA_data_t_com_nor, rank = k_range, method = "brunet", nrun = 50, .opt = "v-p")
nmf_result <- nmf(miRNA_data_t_com_nor, rank = 4, method = "brunet", nrun = 1, .opt = "v-p")

#plot to visualuze
consensusmap(nmf_result, main = "Consensus NMF Clustering")a

```

We can also use other clustering methods - including but not limited to Gaussian Mixture Models (GMM) which unlike k-means uses soft assignments.

```{r}
#get the assigned cluster members
#get cluster assignments for best rank
#best_model <- fit(nmf_result, rank = 4)
best_model <- fit(nmf_result)
clusters <- apply(coef(best_model), 2, which.max)

table(clusters) #view assignments
split(names(clusters), clusters) #get members of each cluster
```
13 changes: 13 additions & 0 deletions mirna_subtypes.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
Loading