-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessData.R
More file actions
53 lines (43 loc) · 1.76 KB
/
preprocessData.R
File metadata and controls
53 lines (43 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# PREPROCESS DATA
# correct for batch effects and filter
# read in annotation data
# last modified: January 3rd, 2018
# load packages
library(Biobase)
library(genefilter)
library(sva)
library(AnnotationDbi)
library(dplyr)
# load in eSet
load("prevailAug_eset.rda")
prevail.eset <- prevailJul3.eset[,1:188] # last 21 samples are healthy controls
# get patient codes
groupDesignation <- read.csv("PREVAIL_randomization_20170303.csv")
IDs <- read.csv("PREVAIL_patient_20170303.csv")
patientIDs <- prevail.eset$patient_id
seqID_indices <- match(patientIDs,IDs$patient_id)
patientSeqIDs <- as.character(IDs[seqID_indices,"seq_id"])
patientCodeIndices <- match(patientSeqIDs,groupDesignation$seq_id)
patientCodes <- as.character(groupDesignation[patientCodeIndices,"code"])
# correct for batch effects
phenoData <- pData(prevail.eset)
batch <- phenoData$Batch
modcombat <- model.matrix(~1, data=phenoData) # no adjustment variables, just fit an intercept term
exprsData <- exprs(prevail.eset)
correctedExprs <- ComBat(dat=exprsData, batch=batch, mod=modcombat, par.prior=TRUE, prior.plots=FALSE)
# create new corrected expression set
corrected.eset <- prevail.eset
exprs(corrected.eset) <- correctedExprs
# take the top 50% most variable probes
# use IQR as variance func - robust to outliers
filtered.eset <- varFilter(corrected.eset, var.func=IQR, var.cutoff=0.5, filterByQuantile=TRUE)
filteredExprs <- exprs(filtered.eset)
# add codes to the expression set
filtered.eset$code <- patientCodes
# read in annotation data
getAnnotation <- function() {
geo_sheet <- read.delim('GPL15207-14509.txt', header=TRUE, sep='\t', skip=24)
probes <- dplyr::select(geo_sheet, ID, Gene.Title, Gene.Symbol, Chromosomal.Location, Entrez.Gene, SwissProt, OMIM)
return(probes)
}
annoData <- getAnnotation()