diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d62717f --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +all: git + +git: + git add -A + git commit -am "update package" + git push diff --git a/R/analyze.R b/R/analyze.R index f3ec98e..9965912 100644 --- a/R/analyze.R +++ b/R/analyze.R @@ -33,12 +33,23 @@ -analyze <- function(i = double(), j = double(), mode = "default", path.base = "/scratch/hpc2862/CAMH/perm_container/container_", summary.file = "/scratch/hpc2862/CAMH/perm_container/snp_summary2.out", output = "~/repos/coR-ge/data/test_run2.txt", test = TRUE, safe = TRUE, local = FALSE, h2 = 0.45, pc = 0.5, pnc = 0.5, nc = 1000, gen = NULL, summary = summary, maf = FALSE, maf_range = NULL){ - - -#message(paste0("coR-ge v", packageVersion("coRge"), " \t \t http://github.io/Chris1221/coR-ge \n \n \t Parameters in use: \n \t \t i: ",i, "\n \t \t j: ", j, "\n \t \t path.base: ", path.base,"\n \t \t summary.file: ", summary.file,"\n \t \t output: ", output,"\n \t \t th2: ", h2,"\n \t \t pc: ", pc,"\n \t \t pnc: ", pnc,"\n \t \t nc: ", nc, "\n \n \t Local options: \n \t \t local: ", local,"\n \t \t gen ", gen,"\n \t \t summary: ", summary, " \n \n \t Testing options: \n \t \t test: ", test,"\n \t \t safe: ", safe, "\n \n LOG: \n")) - - +analyze <- function(i = double(), + j = double(), + mode = "default", + path.base = "/scratch/hpc2862/CAMH/perm_container/container_", + summary.file = "/scratch/hpc2862/CAMH/perm_container/snp_summary2.out", + output = "~/repos/coR-ge/data/test_run2.txt", + test = TRUE, + safe = TRUE, + local = FALSE, + h2 = 0.45, + pc = 0.5, + pnc = 0.5, + nc = 1000, + gen = NULL, + summary = summary, + maf = FALSE, + maf_range = NULL){ if(local) { @@ -86,7 +97,7 @@ analyze <- function(i = double(), j = double(), mode = "default", path.base = "/ # ___ __ _ _ # / \ ___ / _| __ _ _ _ | || |_ # / /\ / / _ \| |_ / _` || | | || || __| -# / /_// | __/| _|| (_| || |_| || || |_ +# / /_// | __/| _|| (_| || |_| || || |_ # /___,' \___||_| \__,_| \__,_||_| \__| # diff --git a/inst/analyses/Makefile b/inst/analyses/Makefile index e69de29..3694d87 100644 --- a/inst/analyses/Makefile +++ b/inst/analyses/Makefile @@ -0,0 +1 @@ +.PHONY: sim_gen diff --git a/inst/analyses/new_ref/Makefile b/inst/analyses/new_ref/Makefile new file mode 100644 index 0000000..d216c61 --- /dev/null +++ b/inst/analyses/new_ref/Makefile @@ -0,0 +1,59 @@ +## This analysis is centered around using a new reference panel with more SNPs +# hopefully attenuating the increase in false positives that we have been +# observing in the previous analyses. +# +# This is going hopefully towards publication eventually. + +# Set up some variables first. +home=/scratch/hpc2862/CAMH/new_corge +hapgen2=$(home)/bin/hapgen2 + +# The default rule +# For now just update the git repo. +.PHONY: all +all: $(home)/summary/stats.txt + +# Submission +# +sub: + qsub -N "new_run" sub.sh + +# Git add rule +# +git: + cd ../../.. && git add -A + git commit -am "update coRge for new analysis see changelog" + #git push + +# Download and unpack the reference files. +$(home)/ref/.done: + cd $(home)/ref && wget https://mathgen.stats.ox.ac.uk/impute/1000GP_Phase3.tgz + cd $(home)/ref && tar -xvzf 1000GP_Phase3.tgz + cd $(home)/ref/1000GP_Phase3 && mv * ../ + cd $(home)/ref && rm -rf 1000GP_Phase3.tgz 1000GP_Phase3 + touch $(home)/ref/.done # Done + +# This is the simulation protocol +# Note that the target is a placeholder generated at the end of the simulation +# as opposed to any of the actual files for the sake of simplicity. +$(home)/sim/.done: $(home)/ref/.done + mkdir -p $(home)/sim + cd $(home)/ref && $(hapgen2) -h 1000GP_Phase3_chr1.hap \ + -l 1000GP_Phase3_chr1.legend \ + -m genetic_map_chr1_combined_b37.txt \ + -dl 723891 1 1.0 1.0 \ + -n 1000 \ + -int 0 500000000 \ + -o ../sim/chr1_sim1 + +$(home)/summary/stats.txt: $(home)/ref/.done + mkdir -p $(home)/summary + $(home)/bin/snptest -summary_stats_only \ + -data $(home)/sim/chr1_sim1.controls.gen $(home)/sim/chr1_sim1.controls.sample \ + -o $(home)/summary/stats.txt + +summary.html: summary.Rmd + Rscript -e 'rmarkdown::render("$<")' + + + diff --git a/inst/analyses/new_ref/run_coRge.R b/inst/analyses/new_ref/run_coRge.R new file mode 100644 index 0000000..9590f31 --- /dev/null +++ b/inst/analyses/new_ref/run_coRge.R @@ -0,0 +1,19 @@ +library(coRge) +library(data.table) + +gen <- fread("/scratch/hpc2862/CAMH/new_corge/sim/chr1_sim1.controls.gen", + h = F, + sep = " ", + data.table = F) + +summary <- fread("/scratch/hpc2862/CAMH/new_corge/summary/stats.txt", + h = T, + sep = " ") + +analyze(i=1, + j = 1, + nc = 1000, + gen = gen, + summary = summary, + local = T, + output ="/scratch/hpc2862/CAMH/new_corge/results/run2.txt") diff --git a/inst/analyses/new_ref/selected_regions.R b/inst/analyses/new_ref/selected_regions.R new file mode 100644 index 0000000..9b84883 --- /dev/null +++ b/inst/analyses/new_ref/selected_regions.R @@ -0,0 +1,113 @@ +#!/usr/bin/Rscript + +# ------------------------------------------------------------------------------- # +# +# Step 1: Take input options from command line +# Currently assuming that only i and j vary but the others can too +# Maybe put this into the makefile directly +# +# ------------------------------------------------------------------------------- # + + +library(dplyr) +library(devtools) +library(data.table) +library(Rcpp) +library(RcppArmadillo) + +#install_github("Chris1221/coR-ge", ref = "devel") +library(coRge) + +test <- FALSE + +args <- commandArgs(TRUE) + +i <- args[1] +j <- args[2] + +path.base <- "/scratch/hpc2862/CAMH/perm_container/container_" + +# ------------------------------------------------------------------------------- # +# +# Step 2: Read in gen and summary files +# This has been removed from the main function to allow looping over analyze(local) +# Perhaps try to save as an R binary file, but not now. +# +# ------------------------------------------------------------------------------- # + +path <- paste0(path.base,i,"_",j,"/") +setwd(path) + +list.files(path)[!grepl("controls.gen", list.files(path))] %>% + file.remove + +if(!test){ + + for(k in 1:5){ + if(k == 1){ + fread(paste0(path, "chr1_block_", i, "_perm_", j, "_k_", k, ".controls.gen"), h = F, sep = " ") %>% as.data.frame() -> gen + } else if(k != 1){ + fread(paste0(path, "chr1_block_", i, "_perm_", j, "_k_", k, ".controls.gen"), h = F, sep = " ") %>% as.data.frame() %>% select(.,-V1:-V5) %>% cbind(gen, .) -> gen + } +} + +} else if(test){ + k <- 1 + + gen <- fread(paste0(path, "chr1_block_", i, "_perm_", j, "_k_", k, ".controls.gen"), h = F, sep = " ") %>% as.data.frame() +} + +colnames(gen) <- paste0("V",1:ncol(gen)) + +summary <- fread("/scratch/hpc2862/CAMH/perm_container/snp_summary2.out", h = T) + +h2 = 0.5 +pc = 0.75 +pnc = 0.2 +nc = 500 + + +for( var in c(50, 500, 5000) ){ + + analyze(i = i, + j = j, + h2 = h2, + pc = pc, + pnc = pnc, + nc = var, + local = TRUE, + gen = gen, + summary = summary, + mode = "ld", + output = "~repos/coR-ge/data/raw/new_panels.txt") +} + +for( var in c(0.1, 0.5, 0.9) ){ + + analyze(i = i, + j = j, + h2 = i, + pc = pc, + pnc = pnc, + nc = var, + local = TRUE, + gen = gen, + summary = summary, + mode = "ld", + output = "~repos/coR-ge/data/raw/new_panels.txt") +} + +for( var in c(0.5,0.8, 0.99) ){ + + analyze(i = i, + j = j, + h2 = h2, + pc = var, + pnc = pnc, + nc = i, + local = TRUE, + gen = gen, + summary = summary, + mode = "ld", + output = "~repos/coR-ge/data/raw/new_panels.txt") +} diff --git a/inst/analyses/new_ref/sub.sh b/inst/analyses/new_ref/sub.sh new file mode 100644 index 0000000..10ae6db --- /dev/null +++ b/inst/analyses/new_ref/sub.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#$ -S /bin/bash +#$ -q abaqus.q +#$ -l qname=abaqus.q +#$ -cwd +#$ -V +#$ -l hostname=sw0050 +#$ -j y +#$ -o /home/hpc2862/repos/coR-ge/inst/logs/$JOB_NAME.txt + +cd /home/hpc2862/repos/coR-ge/inst/analyses/new_ref + +Rscript run_coRge.R diff --git a/inst/analyses/new_ref/summary.Rmd b/inst/analyses/new_ref/summary.Rmd new file mode 100644 index 0000000..f0f78f2 --- /dev/null +++ b/inst/analyses/new_ref/summary.Rmd @@ -0,0 +1,27 @@ +--- +title: "Looking at the new panel" +author: "" +--- + +```{r setup, include = F} +library(dplyr) +``` + + +First we might want to look at a simple summary of the MAF to see if there are any super weird bins that might be affecing the generation of scores to make them abnormally high or low. + +```{r, message = F} +table <- data.table::fread("/scratch/hpc2862/CAMH/new_corge/summary/stats.txt", h = T) +``` + +```{R} +hist(table$all_maf) +``` + +Obviously this is nowhere near Uniform. We're going to git rid of all of the MAF below 0.01 and try again. + +```{R} +table %>% filter(all_maf > 0.01) %>% select(all_maf) %>% unlist %>% as.double %>% hist +``` + +Once we get rid of the rare variants then the MAFs look more reasonable. We're going to go forward and trim them like this going forward. diff --git a/inst/analyses/new_ref/summary.html b/inst/analyses/new_ref/summary.html new file mode 100644 index 0000000..ae7bdbd --- /dev/null +++ b/inst/analyses/new_ref/summary.html @@ -0,0 +1,184 @@ + + + + + + + + + + + + + +Looking at the new panel + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + +

First we might want to look at a simple summary of the MAF to see if there are any super weird bins that might be affecing the generation of scores to make them abnormally high or low.

+
table <- data.table::fread("/scratch/hpc2862/CAMH/new_corge/summary/stats.txt", h = T)
+
## 
+Read 2.6% of 6461692 rows
+Read 5.9% of 6461692 rows
+Read 9.0% of 6461692 rows
+Read 12.5% of 6461692 rows
+Read 15.0% of 6461692 rows
+Read 17.2% of 6461692 rows
+Read 21.5% of 6461692 rows
+Read 26.8% of 6461692 rows
+Read 33.1% of 6461692 rows
+Read 37.5% of 6461692 rows
+Read 40.7% of 6461692 rows
+Read 49.8% of 6461692 rows
+Read 58.8% of 6461692 rows
+Read 60.7% of 6461692 rows
+Read 66.9% of 6461692 rows
+Read 73.8% of 6461692 rows
+Read 83.6% of 6461692 rows
+Read 89.5% of 6461692 rows
+Read 96.4% of 6461692 rows
+Read 6461692 rows and 21 (of 21) columns from 0.583 GB file in 00:00:37
+
## Warning in data.table::fread("/scratch/hpc2862/CAMH/new_corge/summary/
+## stats.txt", : Stopped reading at empty line 6461702 but text exists
+## afterwards (discarded): # Completed successfully at 2017-01-07 15:07:30
+
hist(table$all_maf)
+

+

Obviously this is nowhere near Uniform. We’re going to git rid of all of the MAF below 0.01 and try again.

+
table %>% filter(all_maf > 0.01) %>% select(all_maf) %>% unlist %>% as.double %>% hist
+

+ + + + +
+ + + + + + + + diff --git a/inst/pipelines/bigrun.R b/inst/pipelines/bigrun.R index 87587ba..f776dd0 100644 --- a/inst/pipelines/bigrun.R +++ b/inst/pipelines/bigrun.R @@ -83,3 +83,5 @@ library(foreach) analyze(i = i, j = j, h2 = h2, pc = pc, pnc = pnc, nc = nc, local = TRUE, gen = gen, summary = summary, mode = "ld", output = "~repos/coR-ge/data/raw/pri2.txt") } + + diff --git a/inst/scope/Makefile b/inst/scope/Makefile new file mode 100644 index 0000000..92f9b4c --- /dev/null +++ b/inst/scope/Makefile @@ -0,0 +1,5 @@ +scope.pdf: scope.tex + latexmk -pv -pdf $< + latexmk -c + git add -A + git commit -am "update scope" diff --git a/inst/scope/scope.pdf b/inst/scope/scope.pdf new file mode 100644 index 0000000..aac34d1 Binary files /dev/null and b/inst/scope/scope.pdf differ diff --git a/inst/scope/scope.tex b/inst/scope/scope.tex new file mode 100644 index 0000000..81bd81b --- /dev/null +++ b/inst/scope/scope.tex @@ -0,0 +1,10 @@ +\documentclass{article} +\begin{document} + +\title{Scope of the Project} + +\maketitle + +We're looking to bring this to publication. + +\end{document}