# 17 Dec 2010 # From Mark Robinson, 18 October 2010 # Ongoing help from Mark... # Run this script by e.g. # source(file="/projects/remc_bigdata/Karsan/R/20101218/bammap2counts_DE-H4ac.20101218.R") # source("http://bioconductor.org/biocLite.R") # old.packages(repos=biocinstallRepos()) # update.packages(repos=biocinstallRepos(), ask=FALSE) #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source") # biocLite("Rsamtools") 1.2.0 # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz # biocLite("BSgenome.Mmusculus.UCSC.mm8") 747MB # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18 # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz # load packages #library(IRanges) #library(Biostrings) #library(GenomicRanges) #library(IRanges) library(Rsamtools) library(Repitools) library(edgeR) library(BSgenome.Hsapiens.UCSC.hg18) cat("some parameter settings\n") # need to add a failed-chastity filter and a MAPQ filter #p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) p <- ScanBamParam(what=c( "rname", "strand", "pos", "mapq" ), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) fragSize <- 200 # 1000, 750, 500 bp windows #windowSize <- 2500 windowSize <- 1000 #windowSize <- 750 #windowSize <- 500 cat("fixed bins:",windowSize,"bp; fragSize:",fragSize,"bp\n") # human chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") #print(paste("w=",windowSize,sep="") #print(paste("fragSize=",fragSize,sep="")) # file names (here, BAM format) --> list of GRanges # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads # Compare HS1238 then HS1235 ############################################################################## #readLen <- c(50,75) # vector to match filenames #filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", # "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") ############################################################################## # 16 Dec 2010: 4 lanes of data #readLen <- c(50,75,50,50) # vector to match filenames #filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", # "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", # "/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam", # "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam" ) ############################################################################## # 18 Dec 2010 - only lane-2 data readLen <- c(50,50) # vector to match filenames filenames <- c("/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam", "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam") ############################################################################## # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, # the second elements, the third elements, and so on. Arguments are recycled if necessary. cat("read in .bam files, filter reads\n") ############################################################################## #gr <- mapply(FUN=function(u,v) { # sb <- scanBam(u, param=p)[[1]] # GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) #},filenames,readLen) ############################################################################## # 16 Dec, Mark Robinson: add MAPQ filter here gr <- mapply(FUN=function(u,v) { sb <- scanBam(u, param=p)[[1]] w <- sb$mapq > 10 # SET CUTOFF HERE cat(u, ": Read", length(sb$rname), "mapped non-dup. reads,", round(mean(w)*100,2), "percent with MAPQ>10\n") GRanges( seqnames=paste("chr",sb$rname[w],sep=""), ranges=IRanges(start=sb$pos[w],width=v), strand=sb$strand[w] ) },filenames,readLen) ############################################################################## #names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2") #names(gr) <- c("HS1238_kd","HS1235_ctl") # lane 1 names(gr) <- c("HS1238_kd_2","HS1235_ctl_2") # lane 2 grl <- GRangesList(gr) cat("create table of windows along the genome\n") windows <- genomeBlocks(Hsapiens, chrNames, windowSize) head(windows) #RangedData with 6 rows and 0 value columns across 24 spaces cat("create count table after extending reads to be length fragSize\n") counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE) # MR: If colSums are 0, we'll have to fix the chromosome names problem. colSums(counts) # HS1238_kd HS1235_ctl # 19563732 26082144 cat("filter on the total count for a bin (10)\n") k <- rowSums(counts) > 10 cat("Get normalizing factors for edgeR\n") nf <- calcNormFactors(counts[k,]) ### Original way ############################################################# # 18 Dec 2010 cat("estimate common dispersion as 'd'\n") d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*nf, genes=as.data.frame(windows)[k,1:3]) d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning cat(windowSize,"bp bins: d$common.dispersion=",d$common.dispersion,"\n") ############################################################################## # Mark Robinson, 08 December 2010 - estimating a dispersion when you have no replicates #print("estimate common dispersion as 'dhack'") #dhack <- d <- DGEList(counts=counts[k,], # group=colnames(counts), # lib.size=colSums(counts[k,])*f, # genes=as.data.frame(windows)[k,1:3]) #dhack$samples$group <- factor(c("A","A")) #dhack <- estimateCommonDisp(dhack) #d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning #d$common.dispersion <- dhack$common.dispersion #cat(windowSize,"bp bins: dhack$common.dispersion=",dhack$common.dispersion,"\n") ############################################################################## ### 4 lanes, as replicates, 16 Dec 2010 ##################################### #cat("\n","process as replicates","\n") #d.rep <- DGEList(counts=counts[k,], # group=c("HS1238_kd","HS1235_ctl","HS1238_kd","HS1235_ctl"), # lib.size=colSums(counts[k,])*f, # genes=as.data.frame(windows)[k,1:3]) # #genes=anno[k,1:3]) #d.rep <- estimateCommonDisp(d.rep) #cat("d.rep$common.dispersion=",d.rep$common.dispersion,"\n") # this can be estimated now, but should be v. small - d.rep$common.dispersion= 0.06875074 ############################################################################## cat("exactTest()\n") de <- exactTest(d, pair=c("HS1235_ctl","HS1238_kd")) topTags(de) xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table,adjp=p.adjust(de$table$p.value, method="BH") ) #xx <- cbind(anno[k,-5], counts[k,], id=rownames(de.rep$table), de.rep$table,adjp=p.adjust(de.rep$table$p.value, method="BH") ) cat("write.table, all records, regardless of p-val\n") write.table(xx, "/projects/remc_bigdata/Karsan/R/20101220/HS1238_vs_1235.lane-2-only.mapq10-noDups.w1000.edgeR.all.20101220.txt", sep="\t", row.names=FALSE, quote=FALSE) #cat("save.image()\n") #save.image("Robjects_w1000_2010-12-18-Vancouver.Rdata") # save all objects in a single file ### Saturation analysis ###################################################### # Mark Robinson, 20 Dec 2010 downSamp <- function(d, fr=.5) { g <- rownames(d) newd <- matrix(0,nr=nrow(d), nc=ncol(d), dimnames=dimnames(d)) for(j in 1:ncol(d)) { cat(".") s <- sample( rep(g,d[,j]), round(fr*sum(d[,j])) ) #s <- sample(g, f*sum(d[,j]), prob=d[,j], replace=TRUE) ts <- table(s) newd[names(ts),j] <- ts } cat("\n") newd } print("do subsampling analysis (downsampling counts).") nSamps <- 10 # choose number of samples to take at each proportion subs <- seq( 0.3, 1, by=0.05 ) # choose resolution here names(subs) <- subs dir <- nDE <- lapply(subs, function(u) matrix(0,nrow=nrow(d$counts),ncol=nSamps)) print(date()) for(i in 1:length(subs)) { for(j in 1:nSamps) { DD <- downSamp(d$counts, subs[i]) nfSub <- calcNormFactors(DD) dSub <- DGEList(counts=DD, group=colnames(counts), lib.size=colSums(DD)*nfSub) dSub <- estimateCommonDisp(dSub) ### set this to select lane-1 or lane-2 #deSub <- exactTest(dSub,pair=c("HS1235_ctl","HS1238_kd")) deSub <- exactTest(dSub,pair=c("HS1235_ctl_2","HS1238_kd_2")) ### nDE[[i]][,j] <- deSub$table$p.value dir[[i]][,j] <- sign(deSub$table$logFC) } cat("------ Downsampling at", subs[i],"finished ----------------------\n") } print(date()) ############################################################################## print("generate saturation boxplot for FDR<0.05") fdr <- 0.05; z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) pdf("/projects/remc_bigdata/Karsan/R/20101220/lane2.downSampling-counts.w1000.adjP-LE-0p05.20101220.pdf",w=6,h=6); boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), col=rep(c("blue","green"),each=length(subs)),boxwex=.04, xlab="Percent of dataset", ylab="Number of TSS regions with adjP<0.05", names=NULL, xaxt="n", main="-2000/+1000bp mrgd UCSC gene TSSs (L2)", ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) axis(1) grid(lty="dashed") legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) dev.off()