R version 2.11.1 (2010-05-31) Copyright (C) 2010 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > # 21 October 2010 > # From Mark Robinson, 18 October 2010 > # Ongoing help from Mark... > > # Run this script by e.g. > # source(file="/projects/remc_bigdata/Karsan/R/20101022/bammap2counts_DE.H4ac.R") > > # source("http://bioconductor.org/biocLite.R") > # old.packages(repos=biocinstallRepos()) > # update.packages(repos=biocinstallRepos(), ask=FALSE) > > #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source") > # biocLite("Rsamtools") 1.2.0 > > # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz > # biocLite("BSgenome.Mmusculus.UCSC.mm8") 747MB > # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18 > # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz > > # load packages > #library(IRanges) > #library(Biostrings) > #library(GenomicRanges) > #library(IRanges) > > library(Rsamtools) Loading required package: IRanges Attaching package: 'IRanges' The following object(s) are masked from 'package:base': cbind, Map, mapply, order, paste, pmax, pmax.int, pmin, pmin.int, rbind, rep.int, table Loading required package: GenomicRanges Loading required package: Biostrings > library(Repitools) Loading required package: R.methodsS3 R.methodsS3 v1.2.1 (2010-09-18) successfully loaded. See ?R.methodsS3 for help. Loading required package: BSgenome > library(edgeR) > library(BSgenome.Hsapiens.UCSC.hg18) > > > # some parameter settings > # need to add a failed-chastity filter and a MAPQ filter > p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) > > # 1000, 750, 500 bp windows > windowSize <- 750 > fragSize <- 200 > chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") > > # file names (here, BAM format) --> list of GRanges > # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads > # Compare HS1238 then HS1235 > #readLen <- c(50,75) # vector to match filenames > #filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", > # "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") > # OR compare HS1235 then HS1238 > readLen <- c(75,50) # vector to match filenames > filenames <- c("/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", + "/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam") > > # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, > # the second elements, the third elements, and so on. Arguments are recycled if necessary. > #GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand) > gr <- mapply(FUN=function(u,v) { + sb <- scanBam(u, param=p)[[1]] + GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) + },filenames,readLen) > > #names(gr) <- c("HS1238_kd","HS1235_ctl") > names(gr) <- c("HS1235_ctl","HS1238_kd") > > # file names (here, MAP format) --> list of GRanges [much slower!] > #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz", > # "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz") > #grmap <- lapply(fnmap, FUN=function(u) { > # x <- readAligned(u, type="Bowtie") > # GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), > # strand=strand(x)) > #}) > #grl <- GRangesList(grmap) # use this for MAP files > > grl <- GRangesList(gr) > > # create "table" of windows along the genome > windows <- genomeBlocks(Hsapiens, chrNames, windowSize) > > head(windows) GRanges with 6 ranges and 0 elementMetadata values seqnames ranges strand | | [1] chr1 [ 1, 750] * | [2] chr1 [ 751, 1500] * | [3] chr1 [1501, 2250] * | [4] chr1 [2251, 3000] * | [5] chr1 [3001, 3750] * | [6] chr1 [3751, 4500] * | seqlengths chr1 chr2 chr3 chr4 chr5 chr6 ... chr19 chr20 chr21 chr22 chrX chrY NA NA NA NA NA NA ... NA NA NA NA NA NA > #RangedData with 6 rows and 0 value columns across 24 spaces > # space ranges | > # | > #1 chr1 [ 1, 1000] | > #2 chr1 [1001, 2000] | > #3 chr1 [2001, 3000] | > #4 chr1 [3001, 4000] | > #5 chr1 [4001, 5000] | > #6 chr1 [5001, 6000] | > > > # create count table (after extending reads to be length 'fragSize') > counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE) Counting in HS1235_ctl Counting in HS1238_kd Counting successful. > # MR: If colSums are 0, we'll have to fix the chromosome names problem. > colSums(counts) HS1235_ctl HS1238_kd 27515375 20638484 > # HS1238_kd HS1235_ctl > # 19563732 26082144 > > k <- rowSums(counts) > 10 # filter on the total count for a bin > > # do an edgeR DE analysis of counts > f <- calcNormFactors(counts[k,]) > > d <- DGEList(counts=counts[k,], + group=colnames(counts), + lib.size=colSums(counts[k,])*f, + genes=as.data.frame(windows)[k,1:3]) > d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning Warning message: In estimateCommonDisp(d) : There is no replication. Setting common dispersion to 0. > #Warning message: > #In estimateCommonDisp(d) : > # There is no replication. Setting common dispersion to 0. > > de <- exactTest(d) # 4.6GB Comparison of groups: HS1238_kd - HS1235_ctl > #> de > > proc.time() user system elapsed 713.165 25.815 745.258