# 10 Jan 2011, 08h30 # # Have used knownGene many times. Now try to use 'wgRna' table for miRNAs # # Code from Mark Robinson, 13, 16 Dec 2010 # source("/projects/remc_bigdata/Karsan/R/20110110/edgeR_TSS-rgns_DE-H4ac.20110110.miRNA.R") #install.packages("/home/grobertson/linux_x86_64/R/Rsamtools_1.2.2.tar.gz",type="source") #install.packages("/home/grobertson/linux_x86_64/R/IRanges_1.8.7.tar.gz",type="source") ############################################################################## options(stringsAsFactors=FALSE) library(rtracklayer) session <- browserSession("UCSC") genome(session) <- "hg18" #trackNames(session) ## list the track names # KnownGene used many times, but it does not include miRNA! qKG <- ucscTableQuery(session, "knownGene") cat("getting UCSC knownGene table:",date(),"\n") knownGene <- getTable(qKG) # NEW - 09 Jan 2011 qMiRNA <- ucscTableQuery(session, "wgRna") cat("getting UCSC 'wgRna' table:",date(),"\n") miRNA <- getTable(qMiRNA) cat("done",date(),"\n") # merge TSSs with identical starts knownGene$position <- ifelse(knownGene$strand=="+", knownGene$txStart, knownGene$txEnd) # wgRna table: {bin, chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, type} miRNA$position <- ifelse(miRNA$strand=="+", miRNA$chromStart, miRNA$chromEnd) # Choose which to work with: knownGene or miRNA keyKg <- paste(knownGene$chrom, knownGene$position, sep=":") ukeyKg <- unique(keyKg) mKg <- match(ukeyKg, keyKg) mkgKg <- knownGene[mKg,c("chrom","strand","txStart","txEnd","name","position")] # mkg = merged known genes # keyMiR <- paste(miRNA$chrom, miRNA$position, sep=":") ukeyMiR <- unique(keyMiR) mMiR <- match(ukeyMiR, keyMiR) mkgMiR <- miRNA[mMiR,c("chrom","strand","chromStart","chromEnd","name","position")] # mkg = merged miRNA genes # create merged IDs rownames(mkgKg) <- ukeyKg rownames(mkgMiR) <- ukeyMiR # Choose which to work with: knownGene or miRNA idsKg <- split(knownGene$name, keyKg) collapsedidsKg <- sapply(idsKg, paste, collapse=",") mkgKg$newID <- "" mkgKg[names(collapsedidsKg),"newID"] <- collapsedidsKg idsMiR <- split(miRNA$name, keyMiR) collapsedidsMiR <- sapply(idsMiR, paste, collapse=",") mkgMiR$newID <- "" mkgMiR[names(collapsedidsMiR),"newID"] <- collapsedidsMiR # merge nearby TSSs tssTol <- 500 # if distance b/w TSSs is less than this, they are merged splitTol <- 250000 lockey <- paste(mkg$chrom, mkg$strand, sep=":") ind <- seq_len(nrow(mkg)) inds <- split(ind, lockey) clustno <- lapply(inds, FUN=function(u) { w <- diff(mkg$position[u]) > splitTol z <- cumsum(c(0,w)) s <- split(u, z) cat(".") clusts <- lapply(s, FUN=function(uu) { if(length(uu)==1) return(1) d <- dist(mkg$position[uu]) h <- hclust(d,"ave") cutree(h,h=tssTol) }) paste( rep(names(s), sapply(s,length)), unlist(clusts, use.names=FALSE), sep=".") }) clustno <- unsplit( clustno, lockey ) clustkey <- paste( mkg$chrom, mkg$strand, clustno, sep=":" ) uckey <- unique(clustkey) mc <- match(uckey, clustkey) mkg1 <- mkg[mc,] rownames(mkg1) <- uckey ids <- split(mkg$newID, clustkey) collapsedids <- sapply(ids, paste, collapse=";") mkg1$newID <- "" mkg1[names(collapsedids),"newID"] <- collapsedids rownames(mkg1) <- NULL # Choose which to work with: knownGene or miRNA #anno <- data.frame(chr=mkg1$chrom, strand=mkg1$strand, start=mkg1$txStart, end=mkg1$txEnd, name=mkg1$name, allIDs=mkg1$newID) anno <- data.frame(chr=mkg1$chrom, strand=mkg1$strand, start=mkg1$chromStart, end=mkg1$chromEnd, name=mkg1$name, allIDs=mkg1$newID) # generate unique 'name' column anno$name <- paste(anno$name, anno$chr, anno$start, sep=":") write.table(anno, "/projects/remc_bigdata/Karsan/R/20110109/wgRna.merged-TSSs-d500.20110110.txt", sep="\t", row.names=FALSE, quote=FALSE) cat("\nTSS's are merged:",date(),"\n") #quit() ############################################################################## cat("read in BAM files\n") library(Rsamtools) library(Repitools) library(edgeR) library(BSgenome.Hsapiens.UCSC.hg18) cat("set scanBam parameters\n") # need to add a failed-chastity filter and a MAPQ filter #p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) # 16 Dec 2010, filter by MAPQ p <- ScanBamParam(what=c("rname", "strand", "pos", "mapq"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") fragSize <- 200 ############################################################################## ### set below to select lane-1 or lane-2 ############################################################################## readLen <- c(50,75) # vector to match filenames filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") ############################################################################## # 16 Dec 2010 #readLen <- c(50,75,50,50) # vector to match filenames #filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", # "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", # "/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam", # "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam") ############################################################################## # 18 Dec 2010 - just 'lane-2' data #readLen <- c(50,50) # vector to match filenames #filenames <- c("/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam", # "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam") ############################################################################## # 16 Dec 2010: old way without filtering by MAPQ #gr <- mapply(FUN=function(u,v) { # sb <- scanBam(u, param=p)[[1]] # GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) # },filenames,readLen) #names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2") #names(gr) <- c("HS1238_kd_2","HS1235_ctl_2") ############################################################################## # 16 Dec, Mark Robinson: add MAPQ filter here # 18 Dec 2010 - use for lane-2 data cat("now read in .bam files \n") gr <- mapply(FUN=function(u,v) { sb <- scanBam(u, param=p)[[1]] w <- sb$mapq > 10 # SET CUTOFF HERE cat(u, ": Read", length(sb$rname), "mapped non-dup. reads,", round(mean(w)*100,2), "percent with MAPQ > 10\n") GRanges(seqnames=paste("chr",sb$rname[w],sep=""), ranges=IRanges(start=sb$pos[w],width=v), strand=sb$strand[w]) },filenames,readLen) ### set this to select lane-1 or lane-2 names(gr) <- c("HS1238_kd","HS1235_ctl") #names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2") #names(gr) <- c("HS1238_kd_2","HS1235_ctl_2") ############################################################################## grl <- GRangesList(gr) #cat("TSS regions: get counts\n") #cat("bpUp=1500, bpDown=1000", "\n") #counts <- annotationCounts(grl, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE) cat("bpUp=2000, bpDown=1000", "\n") counts <- annotationCounts(grl, anno, bpUp=2000, bpDown=1000, seqLen=fragSize, verbose=TRUE) #cat("bpUp=2500, bpDown=1000", "\n") #counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1000, seqLen=fragSize, verbose=TRUE) #cat("bpUp=2500, bpDown=1500", "\n") #counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1500, seqLen=fragSize, verbose=TRUE) colSums(counts) #HS1238_kd HS1235_ctl # 3070716 3288149 k <- rowSums(counts) > 10 nf <- calcNormFactors(counts[k,]) ############################################################################## #print("estimate common dispersion as 'dhack'") #dhack <- d <- DGEList(counts=counts[k,], # group=colnames(counts), # lib.size=colSums(counts[k,])*f, # genes=anno[k,1:3]) # #genes=as.data.frame(windows)[k,1:3]) #dhack$samples$group <- factor(c("A","A")) #dhack <- estimateCommonDisp(dhack) #d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning #d$common.dispersion <- dhack$common.dispersion #cat("TSSrgns: dhack$common.dispersion=",dhack$common.dispersion,"\n") ############################################################################## # Old way, without 'dhack', for a single run pair # 18 Dec 2010 - resusing this for lane-2 data d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*nf, genes=anno[k,1:3]) d <- estimateCommonDisp(d) cat("d$common.dispersion=",d$common.dispersion,"\n") print("exactTest()") de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) #de <- exactTest(d,pair=c("HS1235_ctl_2","HS1238_kd_2")) #topTags(de) xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") ) print("write.table, all records, regardless of p-val") #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101226/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merge-500.2p5kb-TSS-1p5kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE) write.table(xx, "/projects/remc_bigdata/Karsan/R/20110109/HS1238_vs_1235.lane-1.mapq10-noDups.wgRna.merged-TSSs-d500.2kb-TSS-1kb.edgeR.all.20110109.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101226/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merged-TSSs-d500.2kb-TSS-1kb.mapq10_noDups.edgeR.all.20101226.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101221/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merge-500.1p5-TSS-1kb.edgeR.all.20101221a.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101218/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merge-500.2kb-TSS-1kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE) #print(date()) #cat("save.image()\n") #save.image("Robjects.2p5kb-TSS-1p5kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file #save.image("Robjects.2p5kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file #save.image("Robjects.2kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file #save.image("Robjects.1.5kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file quit() ############################################################################## ############################################################################## downSamp <- function(d, fr=.5) { g <- rownames(d) newd <- matrix(0,nr=nrow(d), nc=ncol(d), dimnames=dimnames(d)) for(j in 1:ncol(d)) { cat(".") s <- sample( rep(g,d[,j]), round(fr*sum(d[,j])) ) #s <- sample(g, f*sum(d[,j]), prob=d[,j], replace=TRUE) ts <- table(s) newd[names(ts),j] <- ts } cat("\n") newd } print("do subsampling analysis (downsampling counts).") nSamps <- 10 # choose number of samples to take at each proportion subs <- seq( 0.3, 1, by=0.05 ) # choose resolution here names(subs) <- subs #dir <- nDE <- lapply(subs, function(u) matrix(0,nrow=nrow(d$counts),ncol=nSamps)) #print(date()) #for(i in 1:length(subs)) { # for(j in 1:nSamps) { # DD <- downSamp(d$counts, subs[i]) # nfSub <- calcNormFactors(DD) # NEW # dSub <- DGEList(counts=DD, group=colnames(counts), lib.size=colSums(DD)*nfSub) # NEW # #dSub <- DGEList(counts=DD, group=colnames(counts), lib.size=colSums(DD)*nf) # OLD # dSub <- estimateCommonDisp(dSub) # ### set this to select lane-1 or lane-2 # deSub <- exactTest(dSub,pair=c("HS1235_ctl","HS1238_kd")) # lane-1 # #deSub <- exactTest(dSub,pair=c("HS1235_ctl_2","HS1238_kd_2")) # lane-2 # ### # nDE[[i]][,j] <- deSub$table$p.value # dir[[i]][,j] <- sign(deSub$table$logFC) # } # cat("------ Finished downsampling counts at", subs[i],"----------------------\n") #} #print(date()) ############## #print("generate saturation boxplot for FDR<0.05") #fdr <- 0.05; #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # #pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LE-0p05.20101221.pdf",w=6,h=6); #boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), # col=rep(c("blue","green"),each=length(subs)),boxwex=.04, # xlab="Percent of dataset", # ylab="Number of TSS regions with adjP<0.05", # names=NULL, xaxt="n", # main="-1500/+1000bp mrgd UCSC gene TSSs (L1)", # ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) #axis(1) #grid(lty="dashed") #legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) #dev.off() # ############### #print("generate saturation boxplot for FDR<0.01") #fdr <- 0.01; #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # #pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LE-0p01.20101221.pdf",w=6,h=6); #boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), # col=rep(c("blue","green"),each=length(subs)),boxwex=.04, # xlab="Percent of dataset", # ylab="Number of TSS regions with adjP<0.01", # names=NULL, xaxt="n", # main="-1500/+1000bp mrgd UCSC gene TSSs (L1)", # ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) #axis(1) #grid(lty="dashed") #legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) #dev.off() # ############### #print("generate saturation boxplot for FDR<1e-3") #fdr <- 0.001; #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # #pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LT-1e-3.20101221.pdf",w=6,h=6); #boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), # col=rep(c("blue","green"),each=length(subs)),boxwex=.04, # xlab="Percent of dataset", # ylab="Number of TSS regions with adjP<0.001", # names=NULL,xaxt="n", # main="-1500/+1000bp mrgd UCSC gene TSS's (L1)", # ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) #axis(1) #grid(lty="dashed") #legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) #dev.off() # ############### #print("generate saturation boxplot for FDR<1e-4") #fdr <- 0.0001; #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # #pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LT-1e-4.20101221.pdf",w=6,h=6); #boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), # col=rep(c("blue","green"),each=length(subs)),boxwex=.04, # xlab="Percent of dataset", # ylab="Number of TSS regions with adjP<1e-4", # names=NULL,xaxt="n", # main="-1500/+1000bp mrgd UCSC gene TSSs (L1)", # ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) #axis(1) #grid(lty="dashed") #legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) #dev.off() # ############### #print("generate saturation boxplot for FDR<1e-6") #fdr <- 0.000001; #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # #pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LT-1e-6.20101221.pdf",w=6,h=6); #boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), # col=rep(c("blue","green"),each=length(subs)),boxwex=.04, # xlab="Percent of dataset", # ylab="Number of TSS regions with adjP<1e-6", # names=NULL,xaxt="n", # main="-1500/+1000bp mrgd UCSC gene TSSs (L1)", # ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) #axis(1) #grid(lty="dashed") #legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) #dev.off() ############################################################################## ############################################################################## # Mark, 20 Dec 2010 - much slower, downsamples from GRL downSampGRL <- function(grl, fr=.05) { grlSub <- grl ns <- elementLengths(grl) for(j in 1:length(grlSub)) { s <- sort(sample( ns[j], round(fr*ns[j]) )) grlSub[[j]] <- grlSub[[j]][s] } # This is for TSS regions annotationCounts(grlSub, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE) } ### Set up loops ######################################## print("do subsampling analysis (downsampling GRL).") nSamps <- 5 # choose number of samples to take at each proportion subs <- seq( 0.3, 1, by=0.1 ) # choose resolution here #subs <- seq( 0.5, 0.7, by=0.2 ) # choose resolution here names(subs) <- subs ### END: Set up loops ################################## dir.reads <- nDE.reads <- lapply( subs, function(u) matrix(0,nrow=nrow(anno),ncol=nSamps) ) print(date()) for(i in 1:length(subs)) { for(j in 1:nSamps) { ################################ cat("before downSampGRL ...") DD <- downSampGRL(grl, subs[i]) cat("after downSampGRL \n") ################################ nfSub <- calcNormFactors(DD) dSub <- DGEList( counts=DD, group=colnames(counts), lib.size=colSums(DD)*nfSub ) dSub <- estimateCommonDisp(dSub) ### Need to set the next line for the right pair of data sets deSub <- exactTest(dSub,pair=c("HS1235_ctl","HS1238_kd")) #deSub <- exactTest( dSub, pair=c("HS1235_ctl_2","HS1238_kd_2") ) ### nDE.reads[[i]][,j] <- deSub$table$p.value dir.reads[[i]][,j] <- sign(deSub$table$logFC) } cat("------ Finished downsampling reads at", subs[i],"----------------------\n") } print(date()) ############################################################################## print("generate saturation boxplot for FDR<0.05") fdr <- 0.05; ### Set the next two lines depending on which subsampling was used # Subsample from counts? #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # Subsample from GRL z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads) z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads) ### pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-0p05.20101221.pdf",w=6,h=6); boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), col=rep(c("blue","green"),each=length(subs)),boxwex=.04, xlab="Percent of dataset", ylab="Number of TSS regions with adjP<0.05", names=NULL, xaxt="n", main="-1500/+1000bp mrgd UCSC gene TSSs (L2)", ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) axis(1) grid(lty="dashed") legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) dev.off() ############################################################################## print("generate saturation boxplot for FDR<0.01") fdr <- 0.01; ### Set the next two lines depending on which subsampling was used # Subsample from counts? #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # Subsample from GRL z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads) z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads) ### pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-0p01.20101221.pdf",w=6,h=6); boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), col=rep(c("blue","green"),each=length(subs)),boxwex=.04, xlab="Percent of dataset", ylab="Number of TSS regions with adjP<0.01", names=NULL, xaxt="n", main="-1500/+1000bp mrgd UCSC gene TSSs (L2)", ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) axis(1) grid(lty="dashed") legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) dev.off() ############################################################################## print("generate saturation boxplot for FDR<0.001") fdr <- 0.001; ### Set the next two lines depending on which subsampling was used # Subsample from counts? #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # Subsample from GRL z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads) z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads) ### pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-1e-3.20101221.pdf",w=6,h=6); boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), col=rep(c("blue","green"),each=length(subs)),boxwex=.04, xlab="Percent of dataset", ylab="Number of TSS regions with adjP<0.001", names=NULL, xaxt="n", main="-1500/+1000bp mrgd UCSC gene TSSs (L2)", ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) axis(1) grid(lty="dashed") legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) dev.off() ############################################################################## print("generate saturation boxplot for FDR<0.0001") fdr <- 0.0001; ### Set the next two lines depending on which subsampling was used # Subsample from counts? #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # Subsample from GRL z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads) z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads) ### pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-1e-4.20101221.pdf",w=6,h=6); boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), col=rep(c("blue","green"),each=length(subs)),boxwex=.04, xlab="Percent of dataset", ylab="Number of TSS regions with adjP<1e-4", names=NULL, xaxt="n", main="-1500/+1000bp mrgd UCSC gene TSSs (L2)", ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) axis(1) grid(lty="dashed") legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) dev.off() ############################################################################## print("generate saturation boxplot for FDR<1e-6") fdr <- 0.000001; ### Set the next two lines depending on which subsampling was used # Subsample from counts? #z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir) #z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir) # Subsample from GRL z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads) z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads) ### pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-1e-6.20101221.pdf",w=6,h=6); boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02), col=rep(c("blue","green"),each=length(subs)),boxwex=.04, xlab="Percent of dataset", ylab="Number of TSS regions with adjP<1e-6", names=NULL, xaxt="n", main="-1500/+1000bp mrgd UCSC gene TSSs (L2)", ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) ) axis(1) grid(lty="dashed") legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green")) dev.off()