# 10 Jan 2011, 08h30
#
# Have used knownGene many times. Now try to use 'wgRna' table for miRNAs
#
# Code from Mark Robinson, 13, 16 Dec 2010
# source("/projects/remc_bigdata/Karsan/R/20110110/edgeR_TSS-rgns_DE-H4ac.20110110.miRNA.R")

#install.packages("/home/grobertson/linux_x86_64/R/Rsamtools_1.2.2.tar.gz",type="source")
#install.packages("/home/grobertson/linux_x86_64/R/IRanges_1.8.7.tar.gz",type="source")


##############################################################################
options(stringsAsFactors=FALSE)

library(rtracklayer)
session <- browserSession("UCSC")
genome(session) <- "hg18"

#trackNames(session) ## list the track names

# KnownGene used many times, but it does not include miRNA!
qKG <- ucscTableQuery(session, "knownGene")
cat("getting UCSC knownGene table:",date(),"\n")
knownGene <- getTable(qKG)

# NEW - 09 Jan 2011
qMiRNA <- ucscTableQuery(session, "wgRna")
cat("getting UCSC 'wgRna' table:",date(),"\n")
miRNA <- getTable(qMiRNA)
cat("done",date(),"\n")

# merge TSSs with identical starts
knownGene$position <- ifelse(knownGene$strand=="+", knownGene$txStart, knownGene$txEnd)
# wgRna table: {bin, chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, type}
miRNA$position <- ifelse(miRNA$strand=="+", miRNA$chromStart, miRNA$chromEnd)

# Choose which to work with: knownGene or miRNA
keyKg <- paste(knownGene$chrom, knownGene$position, sep=":")
ukeyKg <- unique(keyKg)
mKg <- match(ukeyKg, keyKg)
mkgKg <- knownGene[mKg,c("chrom","strand","txStart","txEnd","name","position")]  # mkg = merged known genes
#
keyMiR <- paste(miRNA$chrom, miRNA$position, sep=":")
ukeyMiR <- unique(keyMiR)
mMiR <- match(ukeyMiR, keyMiR)
mkgMiR <- miRNA[mMiR,c("chrom","strand","chromStart","chromEnd","name","position")]  # mkg = merged miRNA genes

# create merged IDs
rownames(mkgKg) <- ukeyKg
rownames(mkgMiR) <- ukeyMiR

# Choose which to work with: knownGene or miRNA
idsKg <- split(knownGene$name, keyKg)
collapsedidsKg <- sapply(idsKg, paste, collapse=",")
mkgKg$newID <- ""
mkgKg[names(collapsedidsKg),"newID"] <- collapsedidsKg

idsMiR <- split(miRNA$name, keyMiR)
collapsedidsMiR <- sapply(idsMiR, paste, collapse=",")
mkgMiR$newID <- ""
mkgMiR[names(collapsedidsMiR),"newID"] <- collapsedidsMiR


# merge nearby TSSs 
tssTol <- 500  # if distance b/w TSSs is less than this, they are merged
splitTol <- 250000

lockey <- paste(mkg$chrom, mkg$strand, sep=":")
ind <- seq_len(nrow(mkg))
inds <- split(ind, lockey)

clustno <- lapply(inds, FUN=function(u) {
  w <- diff(mkg$position[u]) > splitTol
  z <- cumsum(c(0,w))
  s <- split(u, z)
  cat(".")
  clusts <- lapply(s, FUN=function(uu) {
     if(length(uu)==1) return(1)
     d <- dist(mkg$position[uu])
     h <- hclust(d,"ave")
     cutree(h,h=tssTol)
  })
  paste( rep(names(s), sapply(s,length)), unlist(clusts, use.names=FALSE), sep=".")
})

clustno <- unsplit( clustno, lockey )
clustkey <- paste( mkg$chrom, mkg$strand, clustno, sep=":" )

uckey <- unique(clustkey)
mc <- match(uckey, clustkey)
mkg1 <- mkg[mc,]

rownames(mkg1) <- uckey
ids <- split(mkg$newID, clustkey)
collapsedids <- sapply(ids, paste, collapse=";")
mkg1$newID <- ""
mkg1[names(collapsedids),"newID"] <- collapsedids

rownames(mkg1) <- NULL

# Choose which to work with: knownGene or miRNA
#anno <- data.frame(chr=mkg1$chrom, strand=mkg1$strand, start=mkg1$txStart, end=mkg1$txEnd, name=mkg1$name, allIDs=mkg1$newID)
anno <- data.frame(chr=mkg1$chrom, strand=mkg1$strand, start=mkg1$chromStart, end=mkg1$chromEnd, name=mkg1$name, allIDs=mkg1$newID)
# generate unique 'name' column
anno$name <- paste(anno$name, anno$chr, anno$start, sep=":")

write.table(anno, "/projects/remc_bigdata/Karsan/R/20110109/wgRna.merged-TSSs-d500.20110110.txt", sep="\t", row.names=FALSE, quote=FALSE)

cat("\nTSS's are merged:",date(),"\n")

#quit()


##############################################################################
cat("read in BAM files\n")
library(Rsamtools)
library(Repitools)
library(edgeR)
library(BSgenome.Hsapiens.UCSC.hg18)

cat("set scanBam parameters\n")
# need to add a failed-chastity filter and a MAPQ filter
#p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))
# 16 Dec 2010, filter by MAPQ
p <- ScanBamParam(what=c("rname", "strand", "pos", "mapq"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))

chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "")

fragSize <- 200

##############################################################################
### set below to select lane-1 or lane-2
##############################################################################
readLen <- c(50,75)  # vector to match filenames
filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
               "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam")
##############################################################################
# 16 Dec 2010
#readLen <- c(50,75,50,50)  # vector to match filenames
#filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
#               "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam",
#               "/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam",
#               "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam")
##############################################################################
# 18 Dec 2010 - just 'lane-2' data
#readLen <- c(50,50)  # vector to match filenames
#filenames <- c("/projects/analysis/analysis5/HS1238/62P7NAAXX_2/bwa/62P7NAAXX_2.bam",
#               "/projects/analysis/analysis5/HS1235/62P7NAAXX_1/bwa/62P7NAAXX_1.bam")

##############################################################################
# 16 Dec 2010: old way without filtering by MAPQ
#gr <- mapply(FUN=function(u,v) {
#  sb <- scanBam(u, param=p)[[1]]
#  GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand)
#  },filenames,readLen)
#names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2")
#names(gr) <- c("HS1238_kd_2","HS1235_ctl_2")

##############################################################################
# 16 Dec, Mark Robinson: add MAPQ filter here
# 18 Dec 2010 - use for lane-2 data
cat("now read in .bam files \n")
gr <- mapply(FUN=function(u,v) {
  sb <- scanBam(u, param=p)[[1]]
  w <- sb$mapq > 10  # SET CUTOFF HERE
  cat(u, ": Read", length(sb$rname), "mapped non-dup. reads,", round(mean(w)*100,2), "percent with MAPQ > 10\n")
  GRanges(seqnames=paste("chr",sb$rname[w],sep=""), ranges=IRanges(start=sb$pos[w],width=v), strand=sb$strand[w])
  },filenames,readLen)

### set this to select lane-1 or lane-2
names(gr) <- c("HS1238_kd","HS1235_ctl")
#names(gr) <- c("HS1238_kd","HS1235_ctl","HS1238_kd_2","HS1235_ctl_2")
#names(gr) <- c("HS1238_kd_2","HS1235_ctl_2")
##############################################################################
grl <- GRangesList(gr)


#cat("TSS regions: get counts\n")
#cat("bpUp=1500, bpDown=1000", "\n")
#counts <- annotationCounts(grl, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE)
cat("bpUp=2000, bpDown=1000", "\n")
counts <- annotationCounts(grl, anno, bpUp=2000, bpDown=1000, seqLen=fragSize, verbose=TRUE)
#cat("bpUp=2500, bpDown=1000", "\n")
#counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1000, seqLen=fragSize, verbose=TRUE)
#cat("bpUp=2500, bpDown=1500", "\n")
#counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1500, seqLen=fragSize, verbose=TRUE)

colSums(counts)
#HS1238_kd HS1235_ctl 
#  3070716    3288149
k <- rowSums(counts) > 10 
nf <- calcNormFactors(counts[k,])


##############################################################################
#print("estimate common dispersion as 'dhack'")
#dhack <- d <- DGEList(counts=counts[k,], 
#	         group=colnames(counts), 
#             lib.size=colSums(counts[k,])*f,
#             genes=anno[k,1:3])
#             #genes=as.data.frame(windows)[k,1:3])
#dhack$samples$group <- factor(c("A","A"))
#dhack <- estimateCommonDisp(dhack)
#d <- estimateCommonDisp(d)  # in my example, no bio replicates, will get warning
#d$common.dispersion <- dhack$common.dispersion
#cat("TSSrgns: dhack$common.dispersion=",dhack$common.dispersion,"\n")


##############################################################################
# Old way, without 'dhack', for a single run pair
# 18 Dec 2010 - resusing this for lane-2 data
d <- DGEList(counts=counts[k,], 
            group=colnames(counts), 
            lib.size=colSums(counts[k,])*nf,
            genes=anno[k,1:3])
d <- estimateCommonDisp(d)
cat("d$common.dispersion=",d$common.dispersion,"\n")

print("exactTest()")
de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd"))
#de <- exactTest(d,pair=c("HS1235_ctl_2","HS1238_kd_2"))
#topTags(de)

xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table, adjp=p.adjust(de$table$p.value, method="BH") )

print("write.table, all records, regardless of p-val")
#write.table(xx, "/projects/remc_bigdata/Karsan/R/20101226/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merge-500.2p5kb-TSS-1p5kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE)
write.table(xx, "/projects/remc_bigdata/Karsan/R/20110109/HS1238_vs_1235.lane-1.mapq10-noDups.wgRna.merged-TSSs-d500.2kb-TSS-1kb.edgeR.all.20110109.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xx, "/projects/remc_bigdata/Karsan/R/20101226/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merged-TSSs-d500.2kb-TSS-1kb.mapq10_noDups.edgeR.all.20101226.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xx, "/projects/remc_bigdata/Karsan/R/20101221/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merge-500.1p5-TSS-1kb.edgeR.all.20101221a.txt", sep="\t", row.names=FALSE, quote=FALSE)
#write.table(xx, "/projects/remc_bigdata/Karsan/R/20101218/HS1238_vs_1235.lane-1.mapq10-noDups.knownGene.merge-500.2kb-TSS-1kb.mapq10_noDups.edgeR.all.20101218.txt", sep="\t", row.names=FALSE, quote=FALSE)

#print(date())

#cat("save.image()\n")

#save.image("Robjects.2p5kb-TSS-1p5kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file
#save.image("Robjects.2p5kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file
#save.image("Robjects.2kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file
#save.image("Robjects.1.5kb-TSS-1kb.2010-12-18-Vancouver.Rdata") # save all objects in a single file

quit()


##############################################################################
##############################################################################
downSamp <- function(d, fr=.5) {
  g <- rownames(d)
  newd <- matrix(0,nr=nrow(d), nc=ncol(d), dimnames=dimnames(d))
  for(j in 1:ncol(d)) {
    cat(".")
    s <- sample( rep(g,d[,j]), round(fr*sum(d[,j])) )
    #s <- sample(g, f*sum(d[,j]), prob=d[,j], replace=TRUE)
    ts <- table(s)
    newd[names(ts),j] <- ts
  }
  cat("\n")
  newd
}

print("do subsampling analysis (downsampling counts).")
nSamps <- 10              # choose number of samples to take at each proportion
subs <- seq( 0.3, 1, by=0.05 )  # choose resolution here
names(subs) <- subs

#dir <- nDE <- lapply(subs, function(u) matrix(0,nrow=nrow(d$counts),ncol=nSamps))

#print(date())
#for(i in 1:length(subs)) {
#  for(j in 1:nSamps) {
#    DD <- downSamp(d$counts, subs[i])
#    nfSub <- calcNormFactors(DD)  # NEW
#    dSub <- DGEList(counts=DD, group=colnames(counts), lib.size=colSums(DD)*nfSub) # NEW
#    #dSub <- DGEList(counts=DD, group=colnames(counts), lib.size=colSums(DD)*nf) # OLD
#    dSub <- estimateCommonDisp(dSub)
#    ### set this to select lane-1 or lane-2
#    deSub <- exactTest(dSub,pair=c("HS1235_ctl","HS1238_kd"))       # lane-1
#    #deSub <- exactTest(dSub,pair=c("HS1235_ctl_2","HS1238_kd_2"))  # lane-2
#    ###
#    nDE[[i]][,j] <- deSub$table$p.value
#    dir[[i]][,j] <- sign(deSub$table$logFC)
#  }
#  cat("------ Finished downsampling counts at", subs[i],"----------------------\n")
#}
#print(date())


##############
#print("generate saturation boxplot for FDR<0.05")
#fdr <- 0.05;
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
#
#pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LE-0p05.20101221.pdf",w=6,h=6);
#boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
#        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
#        xlab="Percent of dataset",
#        ylab="Number of TSS regions with adjP<0.05",
#        names=NULL, xaxt="n",
#        main="-1500/+1000bp mrgd UCSC gene TSSs (L1)",
#        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
#axis(1)
#grid(lty="dashed")
#legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
#dev.off()
#
###############
#print("generate saturation boxplot for FDR<0.01")
#fdr <- 0.01;
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
#
#pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LE-0p01.20101221.pdf",w=6,h=6);
#boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
#        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
#        xlab="Percent of dataset",
#        ylab="Number of TSS regions with adjP<0.01", 
#        names=NULL, xaxt="n",
#        main="-1500/+1000bp mrgd UCSC gene TSSs (L1)",
#        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
#axis(1)
#grid(lty="dashed")
#legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
#dev.off()
#
###############
#print("generate saturation boxplot for FDR<1e-3")
#fdr <- 0.001;
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
#
#pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LT-1e-3.20101221.pdf",w=6,h=6);
#boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
#        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
#        xlab="Percent of dataset",
#        ylab="Number of TSS regions with adjP<0.001",
#        names=NULL,xaxt="n",
#        main="-1500/+1000bp mrgd UCSC gene TSS's (L1)",
#        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
#axis(1)
#grid(lty="dashed")
#legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
#dev.off()
#
###############
#print("generate saturation boxplot for FDR<1e-4")
#fdr <- 0.0001;
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
#
#pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LT-1e-4.20101221.pdf",w=6,h=6);
#boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
#        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
#        xlab="Percent of dataset",
#        ylab="Number of TSS regions with adjP<1e-4",
#        names=NULL,xaxt="n",
#        main="-1500/+1000bp mrgd UCSC gene TSSs (L1)",
#        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
#axis(1)
#grid(lty="dashed")
#legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
#dev.off()
#
###############
#print("generate saturation boxplot for FDR<1e-6")
#fdr <- 0.000001;
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
#
#pdf("/projects/remc_bigdata/Karsan/R/20101221/lane1.downSampling-counts.1p5-mrgdTSS-1kb.adjP-LT-1e-6.20101221.pdf",w=6,h=6);
#boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
#        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
#        xlab="Percent of dataset",
#        ylab="Number of TSS regions with adjP<1e-6",
#        names=NULL,xaxt="n",
#        main="-1500/+1000bp mrgd UCSC gene TSSs (L1)",
#        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
#axis(1)
#grid(lty="dashed")
#legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
#dev.off()


##############################################################################
##############################################################################
# Mark, 20 Dec 2010 - much slower, downsamples from GRL
downSampGRL <- function(grl, fr=.05) {
  grlSub <- grl
  ns <- elementLengths(grl)
  for(j in 1:length(grlSub)) {
   s <- sort(sample( ns[j], round(fr*ns[j]) ))
   grlSub[[j]] <- grlSub[[j]][s]
  }
  # This is for TSS regions
  annotationCounts(grlSub, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE)
}


### Set up loops ########################################
print("do subsampling analysis (downsampling GRL).")
nSamps <- 5              # choose number of samples to take at each proportion
subs <- seq( 0.3, 1, by=0.1 )  # choose resolution here
#subs <- seq( 0.5, 0.7, by=0.2 )  # choose resolution here
names(subs) <- subs
### END: Set up loops  ##################################

dir.reads <- nDE.reads <- lapply( subs, function(u) matrix(0,nrow=nrow(anno),ncol=nSamps) )

print(date())
for(i in 1:length(subs)) {
  for(j in 1:nSamps) {
    ################################
    cat("before downSampGRL ...")
    DD <- downSampGRL(grl, subs[i])
    cat("after downSampGRL \n")
    ################################
    nfSub <- calcNormFactors(DD)
    dSub <- DGEList( counts=DD, group=colnames(counts), lib.size=colSums(DD)*nfSub )
    dSub <- estimateCommonDisp(dSub)
    ### Need to set the next line for the right pair of data sets
    deSub <- exactTest(dSub,pair=c("HS1235_ctl","HS1238_kd"))
    #deSub <- exactTest( dSub, pair=c("HS1235_ctl_2","HS1238_kd_2") )
    ###
    nDE.reads[[i]][,j] <- deSub$table$p.value
    dir.reads[[i]][,j] <- sign(deSub$table$logFC)
  }
  cat("------ Finished downsampling reads at", subs[i],"----------------------\n")
}
print(date())


##############################################################################
print("generate saturation boxplot for FDR<0.05")
fdr <- 0.05;
### Set the next two lines depending on which subsampling was used
# Subsample from counts?
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
# Subsample from GRL
z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads)
z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads)
###

pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-0p05.20101221.pdf",w=6,h=6);
boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
        xlab="Percent of dataset",
        ylab="Number of TSS regions with adjP<0.05",
        names=NULL, xaxt="n",
        main="-1500/+1000bp mrgd UCSC gene TSSs (L2)",
        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
axis(1)
grid(lty="dashed")
legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
dev.off()

##############################################################################
print("generate saturation boxplot for FDR<0.01")
fdr <- 0.01;
### Set the next two lines depending on which subsampling was used
# Subsample from counts?
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
# Subsample from GRL
z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads)
z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads)
###

pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-0p01.20101221.pdf",w=6,h=6);
boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
        xlab="Percent of dataset",
        ylab="Number of TSS regions with adjP<0.01",
        names=NULL, xaxt="n",
        main="-1500/+1000bp mrgd UCSC gene TSSs (L2)",
        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
axis(1)
grid(lty="dashed")
legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
dev.off()

##############################################################################
print("generate saturation boxplot for FDR<0.001")
fdr <- 0.001;
### Set the next two lines depending on which subsampling was used
# Subsample from counts?
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
# Subsample from GRL
z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads)
z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads)
###

pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-1e-3.20101221.pdf",w=6,h=6);
boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
        xlab="Percent of dataset",
        ylab="Number of TSS regions with adjP<0.001",
        names=NULL, xaxt="n",
        main="-1500/+1000bp mrgd UCSC gene TSSs (L2)",
        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
axis(1)
grid(lty="dashed")
legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
dev.off()

##############################################################################
print("generate saturation boxplot for FDR<0.0001")
fdr <- 0.0001;
### Set the next two lines depending on which subsampling was used
# Subsample from counts?
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
# Subsample from GRL
z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads)
z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads)
###

pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-1e-4.20101221.pdf",w=6,h=6);
boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
        xlab="Percent of dataset",
        ylab="Number of TSS regions with adjP<1e-4",
        names=NULL, xaxt="n",
        main="-1500/+1000bp mrgd UCSC gene TSSs (L2)",
        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
axis(1)
grid(lty="dashed")
legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
dev.off()

##############################################################################
print("generate saturation boxplot for FDR<1e-6")
fdr <- 0.000001;
### Set the next two lines depending on which subsampling was used
# Subsample from counts?
#z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE, dir)
#z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE, dir)
# Subsample from GRL
z.KD <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v > 0), nDE.reads, dir.reads)
z.CTL <- mapply(function(u,v) colSums(apply(u,2,p.adjust, method="BH") < fdr & v < 0), nDE.reads, dir.reads)
###

pdf("/projects/remc_bigdata/Karsan/R/20101221/lane-1.downSampling-reads.1p5-mrgdTSS-1kb.adjP-LE-1e-6.20101221.pdf",w=6,h=6);
boxplot(data.frame(z.KD,z.CTL), at=c(subs-.02,subs+.02),
        col=rep(c("blue","green"),each=length(subs)),boxwex=.04,
        xlab="Percent of dataset",
        ylab="Number of TSS regions with adjP<1e-6",
        names=NULL, xaxt="n",
        main="-1500/+1000bp mrgd UCSC gene TSSs (L2)",
        ylim=range(c(z.KD,z.CTL)),xlim=extendrange(range(subs)) )
axis(1)
grid(lty="dashed")
legend("topleft",c("KD","CTL"),pch=15,col=c("blue","green"))
dev.off()