# 19 October 2010 # From Mark Robinson, 18 October 2010 #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source") #source("http://bioconductor.org/biocLite.R") #biocLite("Rsamtools") 1.2.0 #biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz # load packages library(IRanges) library(Biostrings) library(GenomicRanges) library(IRanges) library(Rsamtools) library(Repitools) library(edgeR) library(BSgenome.Hsapiens.UCSC.hg18) # some parameter settings p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) windowSize <- 1000 fragSize <- 200 rdlen <- c(50,75) # vector to match filenames chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") # file names (here, BAM format) --> list of GRanges # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, # the second elements, the third elements, and so on. Arguments are recycled if necessary. gr <- mapply(FUN=function(u,v) { sb <- scanBam(u, param=p)[[1]] GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand) },filenames,rdlen) names(gr) <- c("HS1238_kd","HS1235_ctl") # file names (here, MAP format) --> list of GRanges [much slower!] #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz", # "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz") #grmap <- lapply(fnmap, FUN=function(u) { # x <- readAligned(u, type="Bowtie") # GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), # strand=strand(x)) #}) #grl <- GRangesList(grmap) # use this for MAP files grl <- GRangesList(gr) # create "table" of windows along the genome windows <- genomeBlocks(Hsapiens, chrNames, windowSize) #> head(windows) #RangedData with 6 rows and 0 value columns across 24 spaces # space ranges | # | #1 chr1 [ 1, 1000] | #2 chr1 [1001, 2000] | #3 chr1 [2001, 3000] | #4 chr1 [3001, 4000] | #5 chr1 [4001, 5000] | #6 chr1 [5001, 6000] | # create count table (after extending reads to be length 'fragSize') counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE) k <- rowSums(counts) > 10 # filter on the total count for a bin # do an edgeR DE analysis of counts f <- calcNormFactors(counts[k,]) d <- DGEList(counts=counts[k,], group=colnames(counts), lib.size=colSums(counts[k,])*f, genes=as.data.frame(wind)[k,1:3]) d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning de <- exactTest(d) topTags(de)