# 31 October 2010 # From Mark Robinson, 18 October 2010 # Ongoing help from Mark... # Run this script by e.g. # source(file="/projects/remc_bigdata/Karsan/R/20101031/bammap2counts_DE.H4ac.R") # puzzle, 31 October 2010: for HS1235 vs HS1238, get 38 reported first (see below) #Counting in HS1235_ctl #Counting in HS1238_kd #Counting successful. #Comparison of groups: HS1238_kd - HS1235_ctl # source("http://bioconductor.org/biocLite.R") # old.packages(repos=biocinstallRepos()) # update.packages(repos=biocinstallRepos(), ask=FALSE) #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source") # biocLite("Rsamtools") 1.2.0 # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz # biocLite("BSgenome.Mmusculus.UCSC.mm8") 747MB # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18 # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz # load packages #library(IRanges) #library(Biostrings) #library(GenomicRanges) #library(IRanges) library(Rsamtools) library(Repitools) library(edgeR) library(BSgenome.Hsapiens.UCSC.hg18) paste("some parameter settings") # need to add a failed-chastity filter and a MAPQ filter p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) # 1000, 750, 500 bp windows windowSize <- 1000 fragSize <- 200 chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") paste("w=",windowSize) paste("fragSize=",fragSize) # file names (here, BAM format) --> list of GRanges # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads # Compare HS1238 then HS1235 readLen <- c(50,75) # vector to match filenames filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") # OR compare HS1235 then HS1238 #readLen <- c(75,50) # vector to match filenames #filenames <- c("/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", # "/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam") # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, # the second elements, the third elements, and so on. Arguments are recycled if necessary. #GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand) #Add 'chr' to chromosome names paste("read in .bam files") gr <- mapply(FUN=function(u,v) { sb <- scanBam(u, param=p)[[1]] GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) },filenames,readLen) names(gr) <- c("HS1238_kd","HS1235_ctl") #names(gr) <- c("HS1235_ctl","HS1238_kd") # file names (here, MAP format) --> list of GRanges [much slower!] #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz", # "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz") #grmap <- lapply(fnmap, FUN=function(u) { # x <- readAligned(u, type="Bowtie") # GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), # strand=strand(x)) #}) #grl <- GRangesList(grmap) # use this for MAP files grl <- GRangesList(gr) paste("create table of windows along the genome") windows <- genomeBlocks(Hsapiens, chrNames, windowSize) head(windows) #RangedData with 6 rows and 0 value columns across 24 spaces # space ranges | # | #1 chr1 [ 1, 1000] | #2 chr1 [1001, 2000] | #3 chr1 [2001, 3000] | #4 chr1 [3001, 4000] | #5 chr1 [4001, 5000] | #6 chr1 [5001, 6000] | print("create count table (after extending reads to be length 'fragSize')") counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE) # MR: If colSums are 0, we'll have to fix the chromosome names problem. colSums(counts) # HS1238_kd HS1235_ctl # 19563732 26082144 print("filter on the total count for a bin") k <- rowSums(counts) > 10 print("do an edgeR DE analysis of counts") f <- calcNormFactors(counts[k,]) d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*f, genes=as.data.frame(windows)[k,1:3]) d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning #Warning message: #In estimateCommonDisp(d) : # There is no replication. Setting common dispersion to 0. paste(" now do exactTest(d)") de <- exactTest(d,pair=c("HS1238_kd","HS1235_ctl")) # Comparison of groups: HS1235_ctl - HS1238_kd #de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) # Comparison of groups: HS1238_kd - HS1235_ctl #> de #An object of class "deDGEList" #$table # logConc logFC p.value #1 -17.61765 1.02039588 2.954411e-06 #2 -20.89289 0.24495614 1.000000e+00 #3 -20.88492 -0.49612556 5.034447e-01 #4 -19.78170 0.32656991 5.327093e-01 #5 -20.39289 0.07503114 1.000000e+00 #1511146 more rows ... topTags(de) #> topTags(de) #Comparison of groups: HS1238_kd - HS1235_ctl # logConc logFC PValue FDR #2677274 -10.863195 -0.7603573 9.500483e-294 1.435666e-287 #121187 -8.015492 -0.2615755 1.802020e-252 1.361563e-246 #1294341 -9.974019 -0.4176546 6.519279e-166 3.283871e-160 #1606831 -12.850526 -1.0059503 2.006271e-130 7.579445e-125 #1604542 -12.880627 -0.9892234 1.805570e-123 5.456979e-118 #1293822 -14.726969 -1.6940994 4.573325e-100 1.151831e-94 #1719803 -15.587381 2.1549864 9.805623e-95 2.116825e-89 #2012668 -18.029993 -4.9018324 1.904948e-89 3.598330e-84 #2677275 -10.435734 -0.3546701 6.543402e-88 1.098674e-82 #1120720 -11.430976 -0.4916045 1.256872e-84 1.899324e-79 paste("table with genome locations, counts, statistics") xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table) #w <- xx$p.value < .00001 # these are not correct pvals, so you may want to be harsher than this. paste("write.table") write.table(xx, "/projects/remc_bigdata/Karsan/R/20101031/HS1238_vs_1235.w1000.edgeR.txt", sep="\t", row.names=FALSE, quote=FALSE) #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101031/HS1235_vs_1238.w1000.edgeR.txt", sep="\t", row.names=FALSE, quote=FALSE) #seqnames start end width HS1238_kd HS1235_ctl logConc logFC p.value #chr1 1 1000 1000 114 74 -17.6176485910214 1.02039588397656 2.95441057191974e-06 #chr1 1001 2000 1000 9 10 -20.8928937327534 0.244956141995718 0.999999999999992 #chr1 2001 3000 1000 7 13 -20.8849229608189 -0.49612556064272 0.503444671630861 #chr1 3001 4000 1000 20 21 -19.7816975220852 0.32656990754937 0.532709255236125 #chr1 4001 5000 1000 12 15 -20.3928937327534 0.0750311405534056 0.999999999999987 #chr1 5001 6000 1000 10 12 -20.6853749831140 0.133924829606974 0.999999999999999 #chr1 6001 7000 1000 14 16 -20.2351428198894 0.204314157498372 0.85553555190565 #chr1 7001 8000 1000 14 22 -20.0054270105708 -0.255117461138925 0.617719317029688 #chr1 8001 9000 1000 15 11 -20.4556591737953 0.84441821241199 0.122078120708466 #chr1 9001 10000 1000 11 12 -20.616623221239 0.271428353356909 0.677639484405523 #chr1 10001 11000 1000 15 15 -20.2319296853097 0.396959235440768 0.47312965989113 # find differential expression de<-exactTest(d) # highlighting the top 500 most DE tags de.tags <- rownames(topTags(de, n=500)$table) #pdf(file="/projects/remc_bigdata/Karsan/R/20101022/HS1238_vs_1235.w750.edgeR.plotSmear.pdf") #plotSmear(d) #de.tags <- rownames(topTags(de,n=500)$table) #plotSmear(d,de.tags=de.tags) #dev.off() # 21 Oct 2010 - For w=1000, uses a max of ~5.6G for Iva's data. # "/projects/remc_bigdata/Karsan/R/20101022/bammap2counts_DE.H4ac.R" #awk '{if($9<=0.000001) {print $0 > "HS1235_vs_1238.w500.edgeR.p1e-6.txt"}}' HS1235_vs_1238.w500.edgeR.txt