R version 2.11.1 (2010-05-31)
Copyright (C) 2010 The R Foundation for Statistical Computing
ISBN 3-900051-07-0

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # 01 November 2010
> # From Mark Robinson, 18 October 2010
> # Ongoing help from Mark...
> 
> 
> # Run this script by e.g.
> # source(file="/projects/remc_bigdata/Karsan/R/20101206/bammap2counts_DE-H4ac.20101206.R")
> # source(file="/projects/remc_bigdata/Karsan/R/20101102/bammap2counts_DE-H4ac.20101102.R")
> # puzzle, 31 October 2010: for HS1235 vs HS1238, get 38 reported first (see below)
> #Counting in HS1235_ctl 
> #Counting in HS1238_kd 
> #Counting successful.
> #Comparison of groups:  HS1238_kd - HS1235_ctl
> 
> 
> # source("http://bioconductor.org/biocLite.R")
> # old.packages(repos=biocinstallRepos())
> # update.packages(repos=biocinstallRepos(), ask=FALSE)
> 
> #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source")
> # biocLite("Rsamtools") 1.2.0
> 
> # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz
> # biocLite("BSgenome.Mmusculus.UCSC.mm8")  747MB
> # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18
> # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz
> 
> # load packages
> #library(IRanges)
> #library(Biostrings)
> #library(GenomicRanges)
> #library(IRanges)
> 
> library(Rsamtools)
Loading required package: IRanges

Attaching package: 'IRanges'

The following object(s) are masked from 'package:base':

    cbind, Map, mapply, order, paste, pmax, pmax.int, pmin, pmin.int,
    rbind, rep.int, table

Loading required package: GenomicRanges
Loading required package: Biostrings
> library(Repitools)
Loading required package: R.methodsS3
R.methodsS3 v1.2.1 (2010-09-18) successfully loaded. See ?R.methodsS3 for help.
Loading required package: BSgenome
> library(edgeR)
> library(BSgenome.Hsapiens.UCSC.hg18)
> 
> 
> print("some parameter settings")
[1] "some parameter settings"
> # need to add a failed-chastity filter and a MAPQ filter
> p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))
> 
> # 1000, 750, 500 bp windows
> windowSize <- 1000
> #windowSize <- 750
> #windowSize <- 500
> fragSize <- 200
> chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "")
> #print(paste("w=",windowSize,sep="")
> #print(paste("fragSize=",fragSize,sep=""))
> 
> # file names (here, BAM format) --> list of GRanges
> # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads
> # Compare HS1238 then HS1235
> readLen <- c(50,75)  # vector to match filenames
> filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
+                "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam")
> # OR compare HS1235 then HS1238
> #readLen <- c(75,50)  # vector to match filenames
> #filenames <- c("/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam",
> #               "/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam")
> 
> # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, 
> # the second elements, the third elements, and so on. Arguments are recycled if necessary.
> #GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand)
> #Add 'chr' to chromosome names
> print("read in .bam files")
[1] "read in .bam files"
> gr <- mapply(FUN=function(u,v) {
+   sb <- scanBam(u, param=p)[[1]]
+   GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand)
+ },filenames,readLen)
> 
> names(gr) <- c("HS1238_kd","HS1235_ctl")
> #names(gr) <- c("HS1235_ctl","HS1238_kd")
> 
> # file names (here, MAP format) --> list of GRanges [much slower!]
> #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz",
> #           "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz")
> #grmap <- lapply(fnmap, FUN=function(u) {
> #  x <- readAligned(u, type="Bowtie")
> #  GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), 
> #          strand=strand(x))
> #})
> #grl <- GRangesList(grmap)  # use this for MAP files
> 
> grl <- GRangesList(gr)
> 
> print("create table of windows along the genome")
[1] "create table of windows along the genome"
> windows <- genomeBlocks(Hsapiens, chrNames, windowSize)
> 
> head(windows)
GRanges with 6 ranges and 0 elementMetadata values
    seqnames       ranges strand |
       <Rle>    <IRanges>  <Rle> |
[1]     chr1 [   1, 1000]      * |
[2]     chr1 [1001, 2000]      * |
[3]     chr1 [2001, 3000]      * |
[4]     chr1 [3001, 4000]      * |
[5]     chr1 [4001, 5000]      * |
[6]     chr1 [5001, 6000]      * |

seqlengths
  chr1  chr2  chr3  chr4  chr5  chr6 ... chr19 chr20 chr21 chr22  chrX  chrY
    NA    NA    NA    NA    NA    NA ...    NA    NA    NA    NA    NA    NA
> #RangedData with 6 rows and 0 value columns across 24 spaces
> #        space       ranges |
> #  <character>    <IRanges> |
> #1        chr1 [   1, 1000] |
> #2        chr1 [1001, 2000] |
> #3        chr1 [2001, 3000] |
> #4        chr1 [3001, 4000] |
> #5        chr1 [4001, 5000] |
> #6        chr1 [5001, 6000] |
> 
> 
> print("create count table after extending reads to be length fragSize")
[1] "create count table after extending reads to be length fragSize"
> counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE)
Counting in HS1238_kd 
Counting in HS1235_ctl 
Counting successful.
> # MR: If colSums are 0, we'll have to fix the chromosome names problem.
> colSums(counts)
 HS1238_kd HS1235_ctl 
  19563732   26082144 
> # HS1238_kd HS1235_ctl 
> #  19563732   26082144
> 
> #print("filter on the total count for a bin")
> k <- rowSums(counts) > 10  
> 
> #print("do an edgeR DE analysis of counts")
> f <- calcNormFactors(counts[k,])
> 
> d <- DGEList(counts=counts[k,], 
+ 	         group=colnames(counts), 
+              lib.size=colSums(counts[k,])*f,
+              genes=as.data.frame(windows)[k,1:3])
> d <- estimateCommonDisp(d)  # in my example, no bio replicates, will get warning
Warning message:
In estimateCommonDisp(d) :
  There is no replication.  Setting common dispersion to 0.
> # Write out a smear plot PDF
> #pdf("/projects/remc_bigdata/Karsan/R/20101101/DE-H4ac.1000bp-bins.edgeR.20101101.smearPlot.pdf")
> #pdf("/projects/remc_bigdata/Karsan/R/20101101/DE-H4ac.500bp-bins.edgeR.20101101.smearPlot.pdf")
> #plotSmear(d)
> #grid()
> #dev.off()
> 
> #Warning message:
> #In estimateCommonDisp(d) :
> #  There is no replication.  Setting common dispersion to 0.
> print("do exactTest(d)")
[1] "do exactTest(d)"
> #de <- exactTest(d,pair=c("HS1238_kd","HS1235_ctl")) # Comparison of groups:  HS1235_ctl - HS1238_kd
> de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd"))  # Comparison of groups:  HS1238_kd - HS1235_ctl - makes pos-FC=1238-enriched
Comparison of groups:  HS1238_kd - HS1235_ctl 
> 
> #> de
> #An object of class "deDGEList"
> #$table
> #    logConc       logFC      p.value
> #1 -17.61765  1.02039588 2.954411e-06
> #2 -20.89289  0.24495614 1.000000e+00
> #3 -20.88492 -0.49612556 5.034447e-01
> #4 -19.78170  0.32656991 5.327093e-01
> #5 -20.39289  0.07503114 1.000000e+00
> #1511146 more rows ...
> 
> topTags(de)
Comparison of groups:  HS1238_kd - HS1235_ctl 
           logConc      logFC        PValue           FDR
2677274 -10.863195 -0.7603573 9.500483e-294 1.435666e-287
121187   -8.015492 -0.2615755 1.802020e-252 1.361563e-246
1294341  -9.974019 -0.4176546 6.519279e-166 3.283871e-160
1606831 -12.850526 -1.0059503 2.006271e-130 7.579445e-125
1604542 -12.880627 -0.9892234 1.805570e-123 5.456979e-118
1293822 -14.726969 -1.6940994 4.573325e-100  1.151831e-94
1719803 -15.587381  2.1549864  9.805623e-95  2.116825e-89
2012668 -18.029993 -4.9018324  1.904948e-89  3.598330e-84
2677275 -10.435734 -0.3546701  6.543402e-88  1.098674e-82
1120720 -11.430976 -0.4916045  1.256872e-84  1.899324e-79
> #> topTags(de)
> #Comparison of groups:  HS1238_kd - HS1235_ctl 
> #           logConc      logFC        PValue           FDR
> #2677274 -10.863195 -0.7603573 9.500483e-294 1.435666e-287
> #121187   -8.015492 -0.2615755 1.802020e-252 1.361563e-246
> #1294341  -9.974019 -0.4176546 6.519279e-166 3.283871e-160
> #1606831 -12.850526 -1.0059503 2.006271e-130 7.579445e-125
> #1604542 -12.880627 -0.9892234 1.805570e-123 5.456979e-118
> #1293822 -14.726969 -1.6940994 4.573325e-100  1.151831e-94
> #1719803 -15.587381  2.1549864  9.805623e-95  2.116825e-89
> #2012668 -18.029993 -4.9018324  1.904948e-89  3.598330e-84
> #2677275 -10.435734 -0.3546701  6.543402e-88  1.098674e-82
> #1120720 -11.430976 -0.4916045  1.256872e-84  1.899324e-79
> 
> #print("table with genome locations, counts, statistics")
> #xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table)
> # Mark (31 Oct 2010) - add an B&H adjusted p-val to the output
> xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table,adjp=p.adjust(de$table$p.value, method="BH") )
> # Check the table
> table(xx$adjp < .05, xx$logFC > 0)
       
         FALSE   TRUE
  FALSE 763844 733655
  TRUE    6696   6956
> #         FALSE   TRUE
> #  FALSE 763844 733655
> #  TRUE    6696   6956
> 
> #w <- xx$p.value < .00001 # these are not correct pvals, so you may want to be harsher than this.
> # Get low-adjp records
> #w <- xx$adjp < 0.05
> w <- xx$p.value < 0.1
> xxx <- xx[w,]
> 
> print("write.tablem all records")
[1] "write.tablem all records"
> write.table(xx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w1000.d.edgeR.all.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE)
> #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w750.d.edgeR.all.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE)
> 
> print("write.table, records with p-val < 0.01")
[1] "write.table, records with p-val < 0.01"
> write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w1000.d.edgeR.p0.1.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE)
> #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w750.edgeR.p0.1.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE)
> #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101102/HS1238_vs_1235.w500.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE)
> # with 'adjp'
> # 1              2       3       4       5               6               7       8       9       10
> #seqnames        start   end     width   HS1238_kd       HS1235_ctl      logConc logFC   p.value adjp
> #chr1    1       1000    1000    114     74      -17.6176485910214       1.02039588397656        2.95441057191974e-06    0.00171401942252625
> 
> # awk '{ if($10 <= 0.05) { print $0 > "HS1238-vs-1235.w1000.edgeR.adjp-le-0.05.txt" } }' HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt
> #[grobertson@xhost09 20101031]$ wc -l HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt
> # 1,511,152 HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt
> #[grobertson@xhost09 20101031]$ wc -l HS1238-vs-1235.w1000.edgeR.adjp-le-0.05.txt
> # 13,652 HS1238-vs-1235.w1000.edgeR.adjp-le-0.05
> 
> 
> 
> #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101031/HS1235_vs_1238.w1000.edgeR.txt", sep="\t", row.names=FALSE, quote=FALSE)
> #seqnames        start   end     width   HS1238_kd       HS1235_ctl      logConc logFC   p.value
> #chr1    1       1000    1000    114     74      -17.6176485910214       1.02039588397656        2.95441057191974e-06
> #chr1    1001    2000    1000    9       10      -20.8928937327534       0.244956141995718       0.999999999999992
> #chr1    2001    3000    1000    7       13      -20.8849229608189       -0.49612556064272       0.503444671630861
> #chr1    3001    4000    1000    20      21      -19.7816975220852       0.32656990754937        0.532709255236125
> #chr1    4001    5000    1000    12      15      -20.3928937327534       0.0750311405534056      0.999999999999987
> #chr1    5001    6000    1000    10      12      -20.6853749831140       0.133924829606974       0.999999999999999
> #chr1    6001    7000    1000    14      16      -20.2351428198894       0.204314157498372       0.85553555190565
> #chr1    7001    8000    1000    14      22      -20.0054270105708       -0.255117461138925      0.617719317029688
> #chr1    8001    9000    1000    15      11      -20.4556591737953       0.84441821241199        0.122078120708466
> #chr1    9001    10000   1000    11      12      -20.616623221239        0.271428353356909       0.677639484405523
> #chr1    10001   11000   1000    15      15      -20.2319296853097       0.396959235440768       0.47312965989113
> 
> # find differential expression de<-exactTest(d)
> # highlighting the top 500 most DE tags de.tags <- rownames(topTags(de, n=500)$table)
> #pdf(file="/projects/remc_bigdata/Karsan/R/20101022/HS1238_vs_1235.w750.edgeR.plotSmear.pdf")
> #plotSmear(d)
> #de.tags <- rownames(topTags(de,n=500)$table)
> #plotSmear(d,de.tags=de.tags)
> #dev.off()
> 
> # 21 Oct 2010 - For w=1000, uses a max of ~5.6G for Iva's data.
> # "/projects/remc_bigdata/Karsan/R/20101022/bammap2counts_DE.H4ac.R"
> 
> #awk '{if($9<=0.000001) {print $0 > "HS1235_vs_1238.w500.edgeR.p1e-6.txt"}}' HS1235_vs_1238.w500.edgeR.txt
> 
> ###############
> #print("now do the opposite DE calculation")
> #de1 <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd"))  # Comparison of groups:  HS1238_kd - HS1235_ctl
> #topTags(de)
> #xx1 <- cbind(as.data.frame(windows)[k,-5], counts[k,], de1$table)
> #w1 <- xx$p.value < 0.1
> #xxx1 <- xx1[w,]
> #write.table(xx1, "/projects/remc_bigdata/Karsan/R/20101128_TSS/HS1235_vs_1238.w1000.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE)
> #write.table(xx1, "/projects/remc_bigdata/Karsan/R/20101102/HS1235_vs_1238.w500.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE)
> 
> 
> ##############################################################################
> ### 31 October 2010, Mark Robinson ###########################################
> print("-------------------------------------------")
[1] "-------------------------------------------"
> print("Now do differentially enriched TSS regions")
[1] "Now do differentially enriched TSS regions"
> library(rtracklayer)
Loading required package: RCurl
Loading required package: bitops
> session <- browserSession("UCSC")
> genome(session) <- "hg18"
> #trackNames(session) ## list the track names
> q <- ucscTableQuery(session, "knownGene")
> knownGene <- getTable(q)
> 
> 
> ### END TSS-merge code from Mark Robinson - 12 Dec 2010 ########################
> # merge TSSs with identical starts
> knownGene$position <- ifelse(knownGene$strand=="+", knownGene$txStart, knownGene$txEnd)
> key <- paste(knownGene$chrom, knownGene$position, sep=":")
> ukey <- unique(key)
> m <- match(ukey, key)
> mkg <- knownGene[m,c("chrom","strand","txStart","txEnd","name","position")]  # mkg = merged known genes
> 
> # create merged IDs
> rownames(mkg) <- ukey
> ids <- split(knownGene$name, key)