# 01 November 2010 # From Mark Robinson, 18 October 2010 # Ongoing help from Mark... # Run this script by e.g. # source(file="/projects/remc_bigdata/Karsan/R/20101208/bammap2counts_DE-H4ac.20101208.R") # source(file="/projects/remc_bigdata/Karsan/R/20101206/bammap2counts_DE-H4ac.20101206.R") # source(file="/projects/remc_bigdata/Karsan/R/20101102/bammap2counts_DE-H4ac.20101102.R") # puzzle, 31 October 2010: for HS1235 vs HS1238, get 38 reported first (see below) #Counting in HS1235_ctl #Counting in HS1238_kd #Counting successful. #Comparison of groups: HS1238_kd - HS1235_ctl # source("http://bioconductor.org/biocLite.R") # old.packages(repos=biocinstallRepos()) # update.packages(repos=biocinstallRepos(), ask=FALSE) #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source") # biocLite("Rsamtools") 1.2.0 # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz # biocLite("BSgenome.Mmusculus.UCSC.mm8") 747MB # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18 # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz # load packages #library(IRanges) #library(Biostrings) #library(GenomicRanges) #library(IRanges) library(Rsamtools) library(Repitools) library(edgeR) library(BSgenome.Hsapiens.UCSC.hg18) print("some parameter settings") # need to add a failed-chastity filter and a MAPQ filter p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) # 1000, 750, 500 bp windows #windowSize <- 1000 windowSize <- 750 #windowSize <- 500 fragSize <- 200 cat("fixed bins:",windowSize,"bp; fragSize:",fragSize,"bp","\n") chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") #print(paste("w=",windowSize,sep="") #print(paste("fragSize=",fragSize,sep="")) # file names (here, BAM format) --> list of GRanges # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads # Compare HS1238 then HS1235 readLen <- c(50,75) # vector to match filenames filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") # OR compare HS1235 then HS1238 #readLen <- c(75,50) # vector to match filenames #filenames <- c("/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", # "/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam") # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, # the second elements, the third elements, and so on. Arguments are recycled if necessary. #GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand) #Add 'chr' to chromosome names print("read in .bam files") gr <- mapply(FUN=function(u,v) { sb <- scanBam(u, param=p)[[1]] GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) },filenames,readLen) names(gr) <- c("HS1238_kd","HS1235_ctl") #names(gr) <- c("HS1235_ctl","HS1238_kd") # file names (here, MAP format) --> list of GRanges [much slower!] #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz", # "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz") #grmap <- lapply(fnmap, FUN=function(u) { # x <- readAligned(u, type="Bowtie") # GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), # strand=strand(x)) #}) #grl <- GRangesList(grmap) # use this for MAP files grl <- GRangesList(gr) print("create table of windows along the genome") windows <- genomeBlocks(Hsapiens, chrNames, windowSize) head(windows) #RangedData with 6 rows and 0 value columns across 24 spaces # space ranges | # | #1 chr1 [ 1, 1000] | #2 chr1 [1001, 2000] | #3 chr1 [2001, 3000] | #4 chr1 [3001, 4000] | #5 chr1 [4001, 5000] | #6 chr1 [5001, 6000] | print("create count table after extending reads to be length fragSize") counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE) # MR: If colSums are 0, we'll have to fix the chromosome names problem. colSums(counts) # HS1238_kd HS1235_ctl # 19563732 26082144 print("filter on the total count for a bin (10)") k <- rowSums(counts) > 10 print("do an edgeR DE analysis of counts") f <- calcNormFactors(counts[k,]) ############################################################################## # Mark Robinson, 08 December 2010 print("estimate common dispersion as 'dhack'") dhack <- d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*f, genes=as.data.frame(windows)[k,1:3]) dhack$samples$group <- factor(c("A","A")) dhack <- estimateCommonDisp(dhackTss) d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning d$common.dispersion <- dhack$common.dispersion cat(windowSize,"bp bins: dhack$common.dispersion=",dhack$common.dispersion,"\n") ############################################################################## # Write out a smear plot PDF #pdf("/projects/remc_bigdata/Karsan/R/20101101/DE-H4ac.1000bp-bins.edgeR.20101101.smearPlot.pdf") #pdf("/projects/remc_bigdata/Karsan/R/20101101/DE-H4ac.500bp-bins.edgeR.20101101.smearPlot.pdf") #plotSmear(d) #grid() #dev.off() #Warning message: #In estimateCommonDisp(d) : # There is no replication. Setting common dispersion to 0. print("do exactTest(d)") #de <- exactTest(d,pair=c("HS1238_kd","HS1235_ctl")) # Comparison of groups: HS1235_ctl - HS1238_kd de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) # Comparison of groups: HS1238_kd - HS1235_ctl - makes pos-FC=1238-enriched #> de #An object of class "deDGEList" #$table # logConc logFC p.value #1 -17.61765 1.02039588 2.954411e-06 #2 -20.89289 0.24495614 1.000000e+00 #3 -20.88492 -0.49612556 5.034447e-01 #4 -19.78170 0.32656991 5.327093e-01 #5 -20.39289 0.07503114 1.000000e+00 #1511146 more rows ... topTags(de) #> topTags(de) #Comparison of groups: HS1238_kd - HS1235_ctl # logConc logFC PValue FDR #2677274 -10.863195 -0.7603573 9.500483e-294 1.435666e-287 #121187 -8.015492 -0.2615755 1.802020e-252 1.361563e-246 #1294341 -9.974019 -0.4176546 6.519279e-166 3.283871e-160 #1606831 -12.850526 -1.0059503 2.006271e-130 7.579445e-125 #1604542 -12.880627 -0.9892234 1.805570e-123 5.456979e-118 #1293822 -14.726969 -1.6940994 4.573325e-100 1.151831e-94 #1719803 -15.587381 2.1549864 9.805623e-95 2.116825e-89 #2012668 -18.029993 -4.9018324 1.904948e-89 3.598330e-84 #2677275 -10.435734 -0.3546701 6.543402e-88 1.098674e-82 #1120720 -11.430976 -0.4916045 1.256872e-84 1.899324e-79 #print("table with genome locations, counts, statistics") #xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table) # Mark (31 Oct 2010) - add an B&H adjusted p-val to the output xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table,adjp=p.adjust(de$table$p.value, method="BH") ) # Check the table table(xx$adjp < .05, xx$logFC > 0) # FALSE TRUE # FALSE 763844 733655 # TRUE 6696 6956 #w <- xx$p.value < .00001 # these are not correct pvals, so you may want to be harsher than this. # Get low-adjp records #w <- xx$adjp < 0.05 w <- xx$p.value < 0.1 xxx <- xx[w,] print("write.tablem all records") #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.bins-w1000.edgeR.dispersion.all.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.bins-w750.edgeR.dispersion.all.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.bins-w500.edgeR.dispersion.all.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) print("write.table, records with p-val < 0.1") #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.bins-w1000.edgeR.dispersion.p0.1.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.bins-w750.edgeR.dispersion.p0.1.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.bins-w500.edgeR.dispersion.p0.1.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) # with 'adjp' # 1 2 3 4 5 6 7 8 9 10 #seqnames start end width HS1238_kd HS1235_ctl logConc logFC p.value adjp #chr1 1 1000 1000 114 74 -17.6176485910214 1.02039588397656 2.95441057191974e-06 0.00171401942252625 # awk '{ if($10 <= 0.05) { print $0 > "HS1238-vs-1235.w1000.edgeR.adjp-le-0.05.txt" } }' HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt #[grobertson@xhost09 20101031]$ wc -l HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt # 1,511,152 HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt #[grobertson@xhost09 20101031]$ wc -l HS1238-vs-1235.w1000.edgeR.adjp-le-0.05.txt # 13,652 HS1238-vs-1235.w1000.edgeR.adjp-le-0.05 #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101031/HS1235_vs_1238.w1000.edgeR.txt", sep="\t", row.names=FALSE, quote=FALSE) #seqnames start end width HS1238_kd HS1235_ctl logConc logFC p.value #chr1 1 1000 1000 114 74 -17.6176485910214 1.02039588397656 2.95441057191974e-06 #chr1 1001 2000 1000 9 10 -20.8928937327534 0.244956141995718 0.999999999999992 #chr1 2001 3000 1000 7 13 -20.8849229608189 -0.49612556064272 0.503444671630861 #chr1 3001 4000 1000 20 21 -19.7816975220852 0.32656990754937 0.532709255236125 #chr1 4001 5000 1000 12 15 -20.3928937327534 0.0750311405534056 0.999999999999987 #chr1 5001 6000 1000 10 12 -20.6853749831140 0.133924829606974 0.999999999999999 #chr1 6001 7000 1000 14 16 -20.2351428198894 0.204314157498372 0.85553555190565 #chr1 7001 8000 1000 14 22 -20.0054270105708 -0.255117461138925 0.617719317029688 #chr1 8001 9000 1000 15 11 -20.4556591737953 0.84441821241199 0.122078120708466 #chr1 9001 10000 1000 11 12 -20.616623221239 0.271428353356909 0.677639484405523 #chr1 10001 11000 1000 15 15 -20.2319296853097 0.396959235440768 0.47312965989113 # find differential expression de<-exactTest(d) # highlighting the top 500 most DE tags de.tags <- rownames(topTags(de, n=500)$table) #pdf(file="/projects/remc_bigdata/Karsan/R/20101022/HS1238_vs_1235.w750.edgeR.plotSmear.pdf") #plotSmear(d) #de.tags <- rownames(topTags(de,n=500)$table) #plotSmear(d,de.tags=de.tags) #dev.off() # 21 Oct 2010 - For w=1000, uses a max of ~5.6G for Iva's data. # "/projects/remc_bigdata/Karsan/R/20101022/bammap2counts_DE.H4ac.R" #awk '{if($9<=0.000001) {print $0 > "HS1235_vs_1238.w500.edgeR.p1e-6.txt"}}' HS1235_vs_1238.w500.edgeR.txt ############### #print("now do the opposite DE calculation") #de1 <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) # Comparison of groups: HS1238_kd - HS1235_ctl #topTags(de) #xx1 <- cbind(as.data.frame(windows)[k,-5], counts[k,], de1$table) #w1 <- xx$p.value < 0.1 #xxx1 <- xx1[w,] #write.table(xx1, "/projects/remc_bigdata/Karsan/R/20101128_TSS/HS1235_vs_1238.w1000.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx1, "/projects/remc_bigdata/Karsan/R/20101102/HS1235_vs_1238.w500.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE) ############################################################################## ### 31 October 2010, Mark Robinson ########################################### cat("\n","-------------------------------------------","\n") print("Now do differentially enriched TSS regions") library(rtracklayer) session <- browserSession("UCSC") genome(session) <- "hg18" #trackNames(session) ## list the track names q <- ucscTableQuery(session, "knownGene") knownGene <- getTable(q) anno <- data.frame(chr=knownGene$chrom, strand=knownGene$strand, start=knownGene$txStart, end=knownGene$txEnd, name=knownGene$name) print("TSS regions: get counts") #cat("bpUp=1500, bpDown=1000", "\n") #counts <- annotationCounts(grl, anno, bpUp=1500, bpDown=1000, seqLen=fragSize, verbose=TRUE) cat("bpUp=2000, bpDown=1000", "\n") counts <- annotationCounts(grl, anno, bpUp=2000, bpDown=1000, seqLen=fragSize, verbose=TRUE) #cat("bpUp=2500, bpDown=1500", "\n") #counts <- annotationCounts(grl, anno, bpUp=2500, bpDown=1500, seqLen=fragSize, verbose=TRUE) head(counts) # HS1238_kd HS1235_ctl #uc001aaa.2 124 83 #uc009vip.1 124 83 #uc009vis.1 28 36 #uc001aag.1 29 37 #uc009vjc.1 28 33 #uc009vjd.1 30 34 colSums(counts) #HS1238_kd HS1235_ctl # 3070716 3288149 k <- rowSums(counts) > 10 f <- calcNormFactors(counts[k,]) print("estimate common dispersion as 'dhack'") dhack <- d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*f, genes=anno[k,1:3]) #genes=as.data.frame(windows)[k,1:3]) dhack$samples$group <- factor(c("A","A")) dhack <- estimateCommonDisp(dhack) d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning d$common.dispersion <- dhack$common.dispersion cat("TSSrgns: dhack$common.dispersion=",dhack$common.dispersion,"\n") ########################################################## # Old way, without 'dhack' #d <- DGEList(counts=counts[k,], # group=colnames(counts), # lib.size=colSums(counts[k,])*f, # genes=anno[k,1:3]) #d <- estimateCommonDisp(d) # END: Old way, without 'dhack' ########################################################## #print("Write out a smear plot PDF") #pdf("/projects/remc_bigdata/Karsan/R/20101208/DE-H4ac.knownGene-TSS-1.5to1kb.edgeR.20101208.smearPlot.pdf") #plotSmear(d) #grid() #dev.off() print("exactTest()") #de <- exactTest(d,pair=c("HS1238_kd","HS1235_ctl")) de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) topTags(de) #Comparison of groups: HS1235_ctl - HS1238_kd # logConc logFC PValue FDR #uc010jxm.1 -14.28035 -2.793026 1.466677e-67 9.151919e-63 #uc003oqf.1 -15.77093 -3.076399 1.000602e-29 2.081220e-25 #uc003oqg.1 -15.77093 -3.076399 1.000602e-29 2.081220e-25 #uc003oyt.1 -13.68057 1.350440 1.744515e-24 2.177120e-20 #uc003oyu.1 -13.68057 1.350440 1.744515e-24 2.177120e-20 #uc009ywd.1 -15.55934 2.571668 1.155206e-23 1.201395e-19 #uc003pfy.1 -14.07025 1.522238 1.781377e-23 1.587945e-19 #uc009ywe.1 -15.43874 2.399982 1.447931e-22 1.129368e-18 #uc001peb.2 -15.77869 2.614995 6.824393e-22 4.731503e-18 #uc009ywf.1 -15.28606 2.179259 6.484478e-21 4.046250e-17 # Add gene symbol (31 October 2010) # MR: Most direct, you could just add an extra column to your table: #Alternatively, when you call the DGEList() constructor, you can specific the #'genes' element to carry along the annotation information all the way to the exactTest(). See ?DGEList ... xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table,adjp=p.adjust(de$table$p.value, method="BH") ) #xxtss <- cbind(anno[k,-5], counts[k,], detss$table,adjp=p.adjust(detss$table$p.value, method="BH") ) print("write.table, all records, regardless of p-val") #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-1.5to1kb.edgeR.dispersion.all.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2to1kb.edgeR.dispersion.all.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2.5to1.5kb.edgeR.dispersion.all.20101208.txt", sep="\t", row.names=FALSE, quote=FALSE) print("write.table, records with p-val < 0.1") w <- xx$p.value < 0.1 xxx <- xx[w,] print("write.table(), only records with pval<0.1") #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-1.5to1kb.edgeR.dispersion.pval-LT-0.1.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2to1kb.edgeR.dispersion.pval-LT-0.1.20101208a.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101208/HS1238_vs_1235.knownGeneTSS-2.5to1.5kb.edgeR.dispersion.pval-LT-0.1.20101208.txt", sep="\t", row.names=FALSE, quote=FALSE) ############################ #print("Now reverse the order") #print("exactTest()") #de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) #topTags(de) #Comparison of groups: HS1235_ctl - HS1238_kd # logConc logFC PValue FDR #uc010jxm.1 -14.28035 -2.793026 1.466677e-67 9.151919e-63 #uc003oqf.1 -15.77093 -3.076399 1.000602e-29 2.081220e-25 #uc003oqg.1 -15.77093 -3.076399 1.000602e-29 2.081220e-25 #uc003oyt.1 -13.68057 1.350440 1.744515e-24 2.177120e-20 #uc003oyu.1 -13.68057 1.350440 1.744515e-24 2.177120e-20 #uc009ywd.1 -15.55934 2.571668 1.155206e-23 1.201395e-19 #uc003pfy.1 -14.07025 1.522238 1.781377e-23 1.587945e-19 #uc009ywe.1 -15.43874 2.399982 1.447931e-22 1.129368e-18 #uc001peb.2 -15.77869 2.614995 6.824393e-22 4.731503e-18 #uc009ywf.1 -15.28606 2.179259 6.484478e-21 4.046250e-17 # Add gene symbol (31 October 2010) # MR: Most direct, you could just add an extra column to your table: #Alternatively, when you call the DGEList() constructor, you can specific the #'genes' element to carry along the annotation information all the way to the exactTest(). See ?DGEList ... #xx <- cbind(anno[k,-5], counts[k,], id=rownames(de$table), de$table,adjp=p.adjust(de$table$p.value, method="BH") ) #xxtss <- cbind(anno[k,-5], counts[k,], detss$table,adjp=p.adjust(detss$table$p.value, method="BH") ) #w <- xx$p.value < 0.1 #xxx <- xx[w1,] #print("write.table()") #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101101/HS1235_vs_1238.knownGeneTSS-1500-1000.edgeR.p0.1.20101101.txt", sep="\t", row.names=FALSE, quote=FALSE) print("done.")