#!/usr/bin/env ruby script_name = "frag_profiler_with_area.rb" version = "26-Sept-2010:16h00" date = "2010-09-26" =begin INTENT 1. Generate per-location profiles for XSET overlap in symmetric or asymmetric windows around e.g. peak maxes, TF sites, TSS's, DE coordinates. As in Heintzmann Nature 2009, Figs. 1, 2. NEEDS 1. 'reads' files, per-chr. OR FindPeaks converted BED file: (from a MAP file) track name="chr22_HS1238_H4ac_hg18 Sequences" description="Illumina maq-aligned reads - HS1238_H4ac_hg18" visibility=4 priority=10 itemRgb="On" chr22 14430607 14430658 TtACTGtGTCAGAACAGAGTGTGCCGATTGTGGTCAGGACTCCATAGCAT 0 - 14430607 14430658 0,0,255 chr22 14430930 14430981 TGGGGGCCAAGGCCCTTGTTATGGGGAtGAAGGCTCTTAGGTGGTAGCCC 0 - 14430930 14430981 0,0,255 chr22 14430935 14430986 GCCAAGGCCcTTGtTaTGGGGATGaAGGctctTagGtGGtAGcCcTccaa 0 + 14430935 14430986 255,0,0 NB 1. We assume that 'reads' 'starts' are frag-starts that include directionality. I.e. a (+)-strand read's start is the frag's left end, and (-) is the right end. STATUS 26Sep2010 - Added FindPeaks4 DE call records, for Iva, HS1238/5 H4ac. 17may2010 - Foxa2, E14.5 liver, for Olivia 12feb2010 - MafA, MM0555-1Lane 05nov09 - Added UCSC knownGene transcripts (66803) instead of RefSeq transcripts. Seems OK. 17oct09 - Added asymmetric regions, e.g. -2kb/+500bp. Need to add region-gene-overlap detector. 05oct09 - Run chr22 locally for HS1238 and HS1235, for +/-2kb on REfSeq TSS's. Running on xhost06. 04oct09 - Run from a FP converted Maq to BED file, gzipped. 24mai09 - Ran new TF lists and a RefSeq list for Brad. Adjusted the TF record for ID=$4 rather than $3. This should not automatically handle both 3- and 4-field TF records. 12mai09 - Adjusted the configuration section for TF-site-based runs. Running on xhost06. 10mai09 - Now can do strand-corrected TSS-based profiles. Verified chr1 K4me3. 09mai09 - Adjusted profile subroutine, checked it, seems better. 04mai09 - Did a Foxa2-pkMax/Foxa2-reads test run. Brad's suggestion. mm8. 02mai09 - Runs. Semi-debugged with 2 small test cases. COMMENTS 1. For a TSS profile, for (-) strand, we generate a standard (+)-strand profile vector then reverse it and return it. This is simple to implement but only works for a symmetric window (which is what we have now). 2. The 'standard' TF list from Brad has his peak 'ID' as the fourth column. This ID should be retained in the profiles generated. TODO 1. Check strand vs. start coord for 'reads' files. 2. A separate script (flexibly) applies binning to reduce the file size. 3. Need to add region-gene-overlap detector. knownGene.txt uc001aaa.2 chr1 + 1115 4121 1115 1115 3 1115,2475,3083, 2090,2584,4121, uc001aaa.2 uc009vip.1 chr1 + 1115 4272 1115 1115 2 1115,2475, 2090,4272, uc009vip.1 uc001aab.2 chr1 - 4268 14764 4268 4268 10 4268,4832,5658,6469,6716,7095,7468,7777,8130,14600, 4692,4901,5810,6628,6918,7231,7605,7924,8242,14764, uc001aab.2 ... =end require 'zlib' #=========================== #assembly = "mm8" #assembly = "mm9" assembly = "hg18" #target_type = "tss" #target_type = "tf" #target_type = "tf_pred_sites" target_type = "de_coords" #test_data_or_real_data = "test_data" test_data_or_real_data = "real_data" #gene_type = "refFlat.txt" #gene_type = "knownGene.txt" data_type = "reads" #data_type = "fp_bed_gz" #host = "laptop" host = "xhost" #calc = "profile_only" #calc = "area_only" #calc = "profile_and_area" reads_chrs_A = %w[ chr22 ] #reads_chrs_A = %w[ chr1 chr2 chr3 chr4 ] xset_len = 200 window = "symmetric" #window = "asymmetric" case window when "symmetric" window_half_width = 500 #bp puts "using a 'symmetric' window: +/-#{500} bp" #raise "please change to asymmetric mode and use equal length half-widths" when "asymmetric" window_5pr_width = 2500 #bp window_3pr_width = 1500 #bp puts "using an 'asymmetric' window: 5':#{window_5pr_width}/3':#{window_3pr_width}bp" end puts "xset_len: #{xset_len} bp" #puts "window_half_width: #{window_half_width} bp" ### Reference targets: TSS's or TF sites ####################################### case target_type when "tf" case test_data_or_real_data when "real_data" # TF sites as targets to profile around case host when "laptop" tf_site_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/TF_sites" when "xhost" # Should move this down into each run's 'case' block tf_site_dir = "/home/grobertson/Profiles/TF_sites" end when "test_data" case host when "laptop" tf_site_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/Tests" tf_site_file = "tf_sites.test.txt" end tf_site_pathfile = File.join( tf_site_dir, tf_site_file ) puts "tf_site_file: #{tf_site_pathfile}" reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/Tests" reads_basefilename = "test_5." reads_label = "test_5" end when "de_coords" ### 26 September 2010 ####################################### de_coords_pathfile = "/projects/remc_bigdata/Karsan/HS1238_kd/de-coord_profiles/HS1238_compare_HS1235-alpha0.01_q10_w400_min3_subp0.5_triangle_subpeaks.regions.p1.0e-06.nofilt.20100728.txt" puts "using DE-coords: #{de_coords_pathfile}" end ### SET THIS ------------------------------------------------------------------- ### For Brad - mismatched - mouse - vs predicted Pdx1 binding sites, not peak maxes #run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0281-H3K4me3-adult-liver" # /projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm9/eland/reads # MM0281.um.eland.noDots.filtered.1.reads #run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0282-H3K4me1-adult-liver" # /projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm9/eland/reads #MM0282.um.eland.noDots.filtered.1.reads #run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0310-H4ac-adult-liver" # /projects/03/genereg/projects/SOLEXA/H4ac/MM0310/mm9/eland/reads # MM0310.um.eland.noDots.filtered.1.reads #run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0411-H3K27me3-adult-liver" # /projects/03/genereg/projects/SOLEXA/H3K27me3/MM0411/mm9/export/reads # 1.reads #run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0507-H3K9me3-adult-liver" # /projects/03/genereg/projects/SOLEXA/H3K9me3/MM0507/mm9/export/reads # 1.reads #------------------------------------------------------------------------------- # H4ac, HS1238, CSL knockdown # chr1_HS1238_H4ac_hg18_maq.bed.gz run = "H4ac_HS1238_CSL_knockdown" #run = "H4ac_HS1235_vectorControl" # Adult islets # ------------ # matched # ------- #run = "PMNrgns__MM0386-H3K4me1-adult-islets" #run = "MM0555-MafA-1L-2517pks_islets__MM0386-H3K4me1-islets" #run = "MM0555-MafA-1L-islets__MM0386-H3K4me1-islets" #run = "MM0325-Foxa2-islets__MM0386-H3K4me1-islets" #run = "MM0325-Foxa2-islets__MM0387-H3K4me3-islets" #run = "MM0388-Pdx1-adult-islets__MM0386-H3K4me1-adult-islets" #run = "MM0388-Pdx1-islets__MM0387-H3K4me3-islets" # mismatched # ---------- #run = "MM0325-Foxa2-islets__MM0282-H3K4me1-liver" #run = "MM0325-Foxa2-islets__MM0281-H3K4me3-liver" # RefSeq # ------ #run = "RefSeq__MM0386-H3K4me1-islets" #run = "RefSeq__MM0387-H3K4me3-islets" #---------------------------------------------------- # E14.5 liver #run = "MM0399-Foxa2-E14.5-liver__MM0393-H3K4me1-E14.5_liver" # Adult liver # ----------- # matched # ------- #run = "MM0261-Foxa2-liver__MM0282-H3K4me1-liver" #run = "MM0261-Foxa2-liver__MM0281-H3K4me3-liver" #run = "MM0351-Hnf4a-liver__MM0282-H3K4me1-liver" #run = "MM0351-Hnf4a-liver__MM0281-H3K4me3-liver" # mismatched # ---------- #run = "MM0261-Foxa2-liver__MM0386-H3K4me1-islets" #run = "MM0261-Foxa2-liver__MM0387-H3K4me3-islets" #run = "MM0261-Foxa2-adult-liver__MM0393-E14.5-H3K4me1-liver" # RefSeq # ------ #run = "RefSeq__MM0282-H3K4me1-liver" #run = "RefSeq__MM0281-H3K4me3-liver" #---------------------------------------------------- # Stat1 HeLa #run = "Stat1_HeLa__HS0260-4L_H3K4me1_unstim" #run = "Stat1_HeLa__HS0262-4L_H3K4me1_stim" puts run case run ################################################################################ when "H4ac_HS1238_CSL_knockdown" puts "H4ac_HS1238_CSL_knockdown" data_type = "fp_bed_gz" if target_type == "tss" #RefSeq or knownGene TSS's case gene_type when "refFlat.txt" transcript_file = "refFlat.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/RefSeq" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/hg18/RefSeq" end when "knownGene.txt" transcript_file = "knownGene.txt" kgXref_file = "kgXref.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/knownGene" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/hg18/knownGene" end end end reads_label = "HS1238_CSL-kd_hg18" bed_gz_basefilename = "HS1238_H4ac_hg18_maq.bed.gz" # Reads filenames: "chr22_HS1238_H4ac_hg18_maq.bed.gz" case host when "laptop" then bed_gz_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1238/BED" out_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1238/Profiles/20091105" when "xhost" then #bed_gz_dir = "/projects/mapp_etc/other/HS1238/bed_gz" bed_gz_dir = "/projects/remc_bigdata/Karsan/HS1238_kd/bed_gz" #out_dir = "/projects/mapp_etc/other/HS1238/profiles/2500-1500bp/knownGene" out_dir = "/projects/remc_bigdata/Karsan/HS1238_kd/de-coord_profiles/20100926/plus-minus_500bp" end # asymmetric #puts "window is: (+) 5'-#{window_5pr_width}/(-) +#{window_3pr_width}-3' bp" #outfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.txt" #logfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.LOG.txt" # symmetric outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.LOG.txt" ################################################################################ when "H4ac_HS1235_vectorControl" puts "H4ac_HS1235_vectorControl" data_type = "fp_bed_gz" if target_type == "tss" #RefSeq TSS's case gene_type when "refFlat.txt" transcript_file = "refFlat.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/RefSeq" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/hg18/RefSeq" end when "knownGene.txt" transcript_file = "knownGene.txt" kgXref_file = "kgXref.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/knownGene" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/hg18/knownGene" end end end reads_label = "HS1235_vectorControl_hg18" bed_gz_basefilename = "HS1235_H4ac_hg18_vecCtl_maq.bed.gz" # Reads filenames: chr22_HS1235_H4ac_hg18_vecCtl_maq.bed.gz case host when "laptop" then bed_gz_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1235_vecCntl/BED" out_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1235_vecCntl/Profiles/20091105" when "xhost" then #bed_gz_dir = "/projects/mapp_etc/other/HS1235/bed_gz" bed_gz_dir = "/projects/remc_bigdata/Karsan/HS1235_ctl/bed_gz" #out_dir = "/projects/mapp_etc/other/HS1235/profiles/2500-1500bp/knownGene" out_dir = "/projects/remc_bigdata/Karsan/HS1238_kd/de-coord_profiles/20100926/plus-minus_500bp" end # asymmetric #puts "window is: (+) 5'-#{window_5pr_width}/(-) +#{window_3pr_width}-3' bp" outfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.txt" logfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.LOG.txt" # symmetric #outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.a.txt" #logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.a.LOG.txt" ################################################################################ # 30 July 2010 when run = "PMNrgns__MM0386-H3K4me1-adult-islets" puts run tf_site_file = "PMNtripleboundloci_locusmiddle.3cols.20100720.bed" # chr3 5431143 1 reads_label = "PMNrgns__MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm9/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/PMNrgns__MM0386_H3K4me1_adult_islets/mm9/plus-minus-2kb" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" # 17 May 2010 when "MM0399-Foxa2-E14.5-liver__MM0393-H3K4me1-E14.5_liver" puts run tf_site_file = "Foxa2_E14.5_liver_10847-peakmax.mm8.20100517.txt" # chr3 5431143 1 reads_label = "MM0393_H3K4me1_E14.5_liver" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0393/mm8/export/reads" out_dir = "/home/grobertson/Profiles/MM0399_Foxa2_E14.5_liver/MM0399_Foxa2-ab_E14.5_liver__MM0393_H3K4me1_mm8/mm8/plus-minus-2kb" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ ### Mouse adult islets ################################################################################ # 17 Feb 2010 - 2517 filtered peaks, mm9 when "MM0555-MafA-1L-2517pks_islets__MM0386-H3K4me1-islets" tf_site_file = "Mafa_2517_ht3_NO_input_NO_tower_peak_max_liftedtomm9_peakmaxcoord.bed" # chr3 5431143 1 reads_label = "MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm9/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0555_MafA_adult-islets/MM0555_MafA-islets__MM0386-H3K4me1-adult-islets/mm9/plus-minus-2kb" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" # 12 Feb 2010 when run = "MM0555-MafA-1L-islets__MM0386-H3K4me1-islets" tf_site_file = "MafA_MM0555-1Lane_mm8_peakMaxCoords.txt" # chr3 5431143 1 reads_label = "MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0555_MafA_adult-islets/MM0555_MafA-islets__MM0386-H3K4me1-adult-islets/mm8/plus-minus-2kb" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" when "MM0325-Foxa2-islets__MM0386-H3K4me1-islets" # matched tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3 5431143 1 reads_label = "MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0386-H3K4me1-adult-islets/mm8/plus-minus-10kb" #when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0386-H3K4me1-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0325-Foxa2-islets__MM0387-H3K4me3-islets" # matched tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3 5431143 1 reads_label = "MM0387_H3K4me3_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0387-H3K4me3-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ### Pdx1 - 20 June 2009 ######################################################## when "MM0388-Pdx1-adult-islets__MM0386-H3K4me1-adult-islets" # matched # Corrected FP2 subpeaks, 19june09 #tf_site_dir = "/projects/03/genereg/projects/SOLEXA/Pdx1/MM0388/mm8/data" #tf_site_file = "Pdx1_MM0388_7L_mm8_xset200_dupsN_ht11.sub.peaks" # FP2: 1 3 3000592 3001087 206 12 23 #tf_site_file = "GalaxyHistoryItem-300-[Pdx_Islet_BED_peakmax.txt].bed.txt" # chr3 5431143 5431144 1 tf_site_file = "Pdx1_MM0388_7L_mm8_xset200_dupsN_ht11.sub.peaks.peakmax_3col.txt" # chr3 3000797 1, 29 June 2009 reads_label = "MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0388_Pdx1_adult_islets/MM0388_Pdx1-islets__MM0386-H3K4me1-adult-islets/mm8/20090629" #out_dir = "/home/grobertson/Profiles/MM0388_Pdx1_adult_islets/MM0388_Pdx1-islets__MM0386-H3K4me1-adult-islets/mm8" end outfile = "#{run}.pm#{window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.pm#{2*window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ### 24 May 2009 ################################################################ when "MM0388-Pdx1-islets__MM0387-H3K4me3-islets" # matched #tf_site_file = "GalaxyHistoryItem-300-[Pdx_Islet_BED_peakmax.txt].bed.txt" # chr3 5431143 5431144 1 tf_site_file = "Pdx1_MM0388_7L_mm8_xset200_dupsN_ht11.sub.peaks.peakmax_3col.txt" # chr3 3000797 1, 29 June 2009 reads_label = "MM0387_H3K4me3_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0388_Pdx1_adult_islets/MM0388_Pdx1-islets__MM0387-H3K4me3-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0325-Foxa2-islets__MM0282-H3K4me1-liver" # mismatched tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3 5431143 1 reads_label = "MM0282_H3K4me1_adult_liver" reads_basefilename = "MM0282.um.eland.noDots.filtered." # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0282-H3K4me1-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0325-Foxa2-islets__MM0281-H3K4me3-liver" # mismatched tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3 5431143 1 reads_label = "MM0387_H3K4me3_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0281-H3K4me3-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "RefSeq__MM0386-H3K4me1-islets" refseq_file = "refFlat.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq" end reads_label = "MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0386_H3K4me1_adult_islets/export/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/RefSeq__MM0386-H3K4me1-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "RefSeq__MM0387-H3K4me3-islets" refseq_file = "refFlat.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq" end reads_label = "MM0387_H3K4me3_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0387_H3K4me3_adult_islets/maq/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/RefSeq__MM0387-H3K4me3-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ ### Mouse adult liver ################################################################################ when "MM0261-Foxa2-liver__MM0282-H3K4me1-liver" # matched tf_site_file = "Foxa2_Liver_peakmax_mm8.txt" reads_label = "MM0282_H3K4me1_adult_liver" reads_basefilename = "MM0282.um.eland.noDots.filtered." # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0282-H3K4me1-adult-liver/mm8/plus-minus-10kb" #when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0282-H3K4me1-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0261-Foxa2-adult-liver__MM0393-E14.5-H3K4me1-liver" # matched tf_site_file = "Foxa2_Liver_peakmax_mm8.txt" reads_label = "MM0393_H3K4me1_E14.5_liver" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise #reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0393/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261_Foxa2_adult_liver/MM0261-Foxa2-adult-liver__MM0393-H3K4me1-E14.5-liver/mm8" end outfile = "#{run}.pm#{2*window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.pm#{2*window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0261-Foxa2-liver__MM0281-H3K4me3-liver" # matched tf_site_file = "Foxa2_Liver_peakmax_mm8.txt" reads_label = "MM0281_H3K4me3_adult_liver" reads_basefilename = "MM0281.um.eland.noDots.filtered." # Reads filenames: MM0281.um.eland.noDots.filtered.10.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm8/eland/reads" # 04mai09 end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0281-H3K4me3-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ### 24 May 2009 ################################################################ when "MM0351-Hnf4a-liver__MM0282-H3K4me1-liver" tf_site_file = "GalaxyHistoryItem-301-[Hnf4a_Liver_BED_peakmax.txt].bed.txt" reads_label = "MM0282_H3K4me1_adult_liver" reads_basefilename = "MM0282.um.eland.noDots.filtered." # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0351_Hnf4a_adult_liver/MM0351-Hnf4a-liver__MM0282-H3K4me1-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ### 24 May 2009 ################################################################ when "MM0351-Hnf4a-liver__MM0281-H3K4me3-liver" tf_site_file = "GalaxyHistoryItem-301-[Hnf4a_Liver_BED_peakmax.txt].bed.txt" reads_label = "MM0281_H3K4me3_adult_liver" reads_basefilename = "MM0281.um.eland.noDots.filtered." # Reads filenames: MM0281.um.eland.noDots.filtered.10.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm8/eland/reads" # 04mai09 end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0351_Hnf4a_adult_liver/MM0351-Hnf4a-liver__MM0281-H3K4me3-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0261-Foxa2-liver__MM0386-H3K4me1-islets" # mismatched tf_site_file = "Foxa2_Liver_peakmax_mm8.txt" reads_label = "MM0386_H3K4me1_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0386-H3K4me1-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "MM0261-Foxa2-liver__MM0387-H3K4me3-islets" # mismatched tf_site_file = "Foxa2_Liver_peakmax_mm8.txt" reads_label = "MM0387_H3K4me3_adult_islets" reads_basefilename = "" # Reads filenames: 1.reads case host when "laptop" then raise "not ready yet" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0387-H3K4me3-adult-islets/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "RefSeq__MM0282-H3K4me1-liver" refseq_file = "refFlat.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq" end reads_label = "MM0282_H3K4me1_adult_liver" reads_basefilename = "MM0282.um.eland.noDots.filtered." # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/RefSeq__MM0282-H3K4me1-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "RefSeq__MM0281-H3K4me3-liver" refseq_file = "refFlat.txt" case host when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq" when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq" end reads_label = "MM0281_H3K4me3_adult_liver" reads_basefilename = "MM0281.um.eland.noDots.filtered." # Reads filenames: MM0281.um.eland.noDots.filtered.10.reads case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm8/eland/reads" # 04mai09 end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/RefSeq__MM0281-H3K4me3-adult-liver/mm8" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "Stat1_HeLa__HS0260-4L_H3K4me1_unstim" tf_site_file = "Stat1_unstim_stim_shared_BED.txt" reads_basefilename = "HS0260.um.eland.noDots.filtered" case host when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/HeLa/H3K4/HS0260_4lanes/eland/reads" end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/Stat1_HeLa/HS0260-4L_H3K4me1_unstim" end outfile = "#{run}.pm#{window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.pm#{window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" ################################################################################ when "Stat1_HeLa__HS0262-4L_H3K4me1_stim" tf_site_file = "Stat1_unstim_stim_shared_BED.txt" reads_basefilename = "HS0262.um.eland.noDots.filtered." case host when "laptop" then raise reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads" when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/HeLa/H3K4/HS0262_4lanes/eland/reads" # 04mai09 end case host when "laptop" then out_dir = reads_dir when "xhost" then out_dir = "/home/grobertson/Profiles/Stat1_HeLa/HS0262-4L_H3K4me1_stim" end outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt" logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt" end ################################################################################ if target_type == "tf" then puts "tf_site_dir: #{tf_site_dir}" puts "tf_site_file: #{tf_site_file}" tf_site_pathfile = File.join( tf_site_dir, tf_site_file ) end # Choose the TF site (or peak-max) file #tf_site_file = "Foxa2_Islet_peakmax.txt" # chr3 5431143 1 #tf_site_file = "Foxa2_Liver_peakmax.txt" #=========================== puts "target_type: #{target_type}" puts "test_data_or_real_data: #{test_data_or_real_data}" puts "host: #{host}" puts "reads_basefilename: #{reads_basefilename}" case test_data_or_real_data when "test_data" then outfile = "#{reads_basefilename}pm#{window_half_width}.x#{xset_len}.#{target_type}.test.txt" logfile = "#{reads_basefilename}pm#{window_half_width}.x#{xset_len}.#{target_type}.test.LOG.txt" end # Profiles out_pathfile = File.join(out_dir,outfile) begin profile_wh = File.open(out_pathfile,'w') puts "output file is: #{out_pathfile}" rescue raise "could not open #{out_pathfile}" end puts "writing output to: #{out_pathfile}" # Logfile log_pathfile = File.join(out_dir,logfile) begin log_wh = File.open(log_pathfile,'w') puts "LOG file is: #{log_pathfile}" rescue raise "could not open #{log_pathfile}" end log_wh.puts "script_name = #{script_name}" log_wh.puts "version = #{version}" log_wh.puts "host = #{host}" log_wh.puts "target_type = #{target_type}" log_wh.puts "test_data_or_real_data = #{test_data_or_real_data}" log_wh.puts "xset_len = #{xset_len}" log_wh.puts "window_half_width = #{window_half_width} bp" log_wh.puts "tf_site_dir = #{tf_site_dir}" log_wh.puts "tf_site_file = #{tf_site_file}" log_wh.puts "reads_dir = #{reads_dir}" log_wh.puts "reads_basefilename = #{reads_basefilename}.*chr*.reads" log_wh.puts "out_dir = #{out_dir}" log_wh.puts "outfile = #{outfile}" ### FUNCTIONS ################################################################## ################################################################################ def decimal_places(x,n) y = ((x * (10**n)).to_i).to_f/(10**n) return y end def commify(number) # Does this work on a float? n_to_s = number.to_s c = n_to_s.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') return c end ################################################################################ =begin CREATE TABLE `refFlat` ( `geneName` varchar(255) NOT NULL default '', `name` varchar(255) NOT NULL default '', `chrom` varchar(255) NOT NULL default '', `strand` char(1) NOT NULL default '', `txStart` int(10) unsigned NOT NULL default '0', `txEnd` int(10) unsigned NOT NULL default '0', `cdsStart` int(10) unsigned NOT NULL default '0', `cdsEnd` int(10) unsigned NOT NULL default '0', `exonCount` int(10) unsigned NOT NULL default '0', `exonStarts` longblob NOT NULL, `exonEnds` longblob NOT NULL, KEY `name` (`name`(12)), KEY `chrom` (`chrom`(7),`txStart`), KEY `chrom_2` (`chrom`(7),`txEnd`), KEY `geneName` (`geneName`(10)) ) ENGINE=MyISAM DEFAULT CHARSET=latin1; Grik1 NM_010348 chr16 - 87784753 88179117 87785052 88178561 16 87784753,87803012,87811935,87824756,87829396,87835403,87836652,87838886,87846348,87884773,87895207,87898563,87923013,87940193,87944866,88178443, 87785208,87803263,87812161,87824974,87829515,87835627,87836856,87839000,87846456,87884917,87895381,87898617,87923195,87940451,87945034,88179117, =end def get_refseq_data_by_chr( ucsc_data_dir, refseq_file ) refseq_pathFile = File.join( ucsc_data_dir, refseq_file ) begin rs_rh = File.open( refseq_pathFile ) rescue raise "can't find #{refseq_pathFile}" end nbr_chr22 = 0 transcripts_by_chr_HoAoA = Hash.new n = 0 begin while line = rs_rh.readline.chomp if line =~ /^(\S+)\s+(\S+)\s+(chr\S+)\s+([+|-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/ symbol = $1 id = $2 chr = $3 strand = $4 txStart = $5.to_i txEnd = $6.to_i cdsStart = $7.to_i cdsEnd = $8.to_i exonCount = $9.to_i rec = [txStart,txEnd,strand,id,symbol] if transcripts_by_chr_HoAoA.has_key?(chr) then transcripts_by_chr_HoAoA[chr] << rec else transcripts_by_chr_HoAoA[chr] = [rec] end n += 1; if (n % 10000 == 0) then puts " #{commify(n)}, #{chr}" end end end rescue EOFError rs_rh.close end puts " found #{commify(n)} RefSeq records" return transcripts_by_chr_HoAoA end ################################################################################ =begin uc001jgi.1 chr10 - 49034611 49152723 49035370 49152616 29 49034611,49041376,49046643,49049042,49050994,49052918,49053894,49056095,49058851,49062614,49062822,49063601,49065240,49070732,49079276,49084796,49090002,49100361,49101171,49110163,49114530,49116039,49117653,49118408,49120209,49122832,49127069,49129614,49152591, 49035419,49041720,49046742,49049257,49051142,49053004,49053988,49056200,49059057,49062731,49062930,49063694,49065341,49070943,49079433,49084982,49090158,49100500,49101319,49110338,49114602,49116172,49117741,49118541,49120401,49122898,49127227,49129740,49152723, NP_001018081 uc001jgi.1 ... We load this into a Hash, gnf_gcRMA_H[probe_id] = expression_val, and find no duplicate ID values, so things are simple. We can get an expression value for a probe simply by calling a hash key. =end def get_knownGene_data_by_chr( ucsc_data_dir, knownGene_file, kgXref_HoA ) knownGene_pathFile = File.join( ucsc_data_dir, knownGene_file ) begin kg_rh = File.open( knownGene_pathFile ) rescue raise "can't find #{knownGene_pathFile}" end knownGene_HoA = Hash.new n = 0 begin while line = kg_rh.readline.chomp if line =~ /^(\S+)\s+(chr\S+)\s+([+|-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)/ kg_id = $1 chr = $2 strand = $3 txStart = $4.to_i txEnd = $5.to_i cdsStart = $6.to_i cdsEnd = $7.to_i exonCount = $8.to_i kg_symbol = $11 if kgXref_HoA.has_key?(kg_id) then gene_symbol = kgXref_HoA[kg_id] else puts " did not find a key in kgXref_HoA for #{kg_id}" end #symbol = "#{gene_symbol}__#{kg_symbol}" kg_rec = [txStart,txEnd,strand,kg_id,gene_symbol] # refseq: rec = [txStart,txEnd,strand,id,symbol] if knownGene_HoA.has_key?(chr) then knownGene_HoA[chr] << kg_rec else knownGene_HoA[chr] = [kg_rec] end n += 1; if (n % 10000 == 0) then puts " #{n}" end end end rescue EOFError kg_rh.close end puts " found #{commify(n)} knownGene records" return knownGene_HoA end ################################################################################ =begin kgXref.txt should give gene symbols CREATE TABLE `kgXref` ( `kgID` varchar(40) NOT NULL default '', `mRNA` varchar(40) default NULL, `spID` varchar(40) default NULL, `spDisplayID` varchar(40) default NULL, `geneSymbol` varchar(40) default NULL, `refseq` varchar(40) default NULL, `protAcc` varchar(40) default NULL, `description` longblob NOT NULL, KEY `kgID` (`kgID`), KEY `mRNA` (`mRNA`), KEY `spID` (`spID`), KEY `spDisplayID` (`spDisplayID`), KEY `geneSymbol` (`geneSymbol`), KEY `refseq` (`refseq`), KEY `protAcc` (`protAcc`) ) ENGINE=MyISAM DEFAULT CHARSET=latin1; uc009vjg.1 BC048429 BC048429 Homo sapiens cDNA clone IMAGE:5275617, **** WARNING: chimeric clone ****. kgID mRNA spID spDisplayID geneSymbol refseq protAcc description $1 $2 $3 $4 $5 $6 $7 $8 uc001aal.1 NM_001005484 Q8NH21 OR4F5_HUMAN OR4F5 NM_001005484 NP_001005484 olfactory receptor, family 4, subfamily F, =end #def get_kgXref_data( ucsc_data_dir, knownGene_file ) # # kg_xref_pathFile = File.join( ucsc_data_dir, kgxref_file ) # begin # kg_rh = File.open( kg_xref_pathFile ) # rescue # raise "can't find #{kg_xref_pathFile}" # end # # kg_xref_H = Hash.new # [kg_id] = symbol # n = 0 # begin # while line = kg_rh.readline.chomp # # if line =~ /^(\S+)\t([^\t]+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)/ # # kg_id = $1 # mrna = $2 # sp_id = $3 # sp_display_id = $4 # gene_symbol = $5 # refseq = $6 # prot_acc = $7 # description = $8 # # kg_rec = [txStart,txEnd,strand,kg_id,kg_symbol] # # if kg_xref_H.has_key?(kg_id) then # raise "found duplicate key for #{kg_id}" # else knownGene_HoA[kg_id] = gene_symbol # end # # n += 1; if (n % 10000 == 0) then puts " #{n}" end # end # end # rescue EOFError # kg_rh.close # end # # puts " found #{commify(n)} knownGene records" # return knownGene_HoA #end ################################################################################ =begin 30 Sept 2008 uc001aaa.2 BC032353 BC032353 Homo sapiens cDNA FLJ36366 fis, clone THYMU2007824. uc009vip.1 AX748260 AX748260 Homo sapiens cDNA FLJ36366 fis, clone THYMU2007824. uc009vjg.1 BC048429 BC048429 Homo sapiens cDNA clone IMAGE:5275617, **** WARNING: chimeric clone ****. uc001aal.1 NM_001005484 Q8NH21 OR4F5_HUMAN OR4F5 NM_001005484 NP_001005484 olfactory receptor, family 4, subfamily F, uc009vjh.1 AY972817 Q52R92 Q52R92_HUMAN OR4F5 NM_001005484 NP_001005484 olfactory receptor, family 4, subfamily F, uc001aaq.1 DQ599874 DQ599874 DQ599874 uc001aar.1 DQ599768 DQ599768 DQ599768 ... =end def get_kgXref_data( ucsc_data_dir, kgXref_file ) puts " get_kgXref_data( ucsc_data_dir, kgXref_file )" kgXref_pathFile = File.join( ucsc_data_dir, kgXref_file ) begin kx_rh = File.open( kgXref_pathFile ) rescue raise "can't find #{kgXref_pathFile}" end kgXref_HoA = Hash.new kgXref_symbol_to_kg_id_HoA = Hash.new # HoA[symbol] = [kg,kg,kg,...] n = 0 n_dups = 0 begin while line = kx_rh.readline.chomp if line =~ /(\S+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)?\t([^\t]+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)/ kg_id = $1 mRNA = $2 spID = $3 spDisplayID = $4 gene_symbol = $5 refseq = $6 protAcc = $7 description = $8 gene_symbol.gsub(/ +/,"_") if kg_id == "uc010nxr.1" || kg_id == "uc010jhd.1" then puts " #{line}" end if kgXref_HoA.has_key?(kg_id) then puts " kgXref: found duplicate known gene ID: #{kg_id}" raise #kgXref_HoA[kg_id] << gene_symbol n_dups += 1 else kgXref_HoA[kg_id] = gene_symbol end if kgXref_symbol_to_kg_id_HoA.has_key?(gene_symbol) then kgXref_symbol_to_kg_id_HoA[gene_symbol] << kg_id #puts " found another gene symbol for #{kg_id}" else kgXref_symbol_to_kg_id_HoA[gene_symbol] = [kg_id] end n += 1; if (n % 10000 == 0) then puts " #{n}" end else puts "did not match #{line}" end end rescue EOFError kx_rh.close end puts " found #{commify(n)} knownGene xRef records" puts " loaded kg Id's for #{commify(kgXref_symbol_to_kg_id_HoA.length)} gene symbols" puts " loaded #{kgXref_HoA.keys.length} 'kg_id' keys" puts " found #{n_dups} repeated symbols for a kg_id" ## Now ensure that symbols are unique #kg_id_keys_A = kgXref_HoA.keys #kg_id_keys_A.each do |k| # list_of_symbols_A = kgXref_HoA[k] # uniq_symbols_A = list_of_symbols_A.uniq # kgXref_HoA[k] = uniq_symbols_A #end return kgXref_HoA, kgXref_symbol_to_kg_id_HoA end ################################################################################ # reads_basefilename = "MM0281.um.eland.noDots.filtered" #.1.reads" # chr1 # reads: 3000965 - 1 def get_reads_data_by_chr( reads_dir, reads_basefilename, chr, xset_len, test_data_or_real_data ) puts "get_reads_data_by_chr( reads_dir, reads_basefilename, chr, xset_len, test_data_or_real_data )" puts " #{chr}" chr_nbr = chr.gsub(/chr/,"") reads_file = "#{reads_basefilename}#{chr_nbr}.reads" #reads_file = "#{chr_nbr}.reads" reads_pathfile = File.join( reads_dir, reads_file ) begin in_rh = File.open(reads_pathfile,'r') rescue raise "could not open #{reads_pathfile}" end xset_recs_on_this_chr_AoA = Array.new nbr_of_reads_read = 0 begin while line = in_rh.readline.chomp if line =~ /(\d+)\s+([+|-])\s+(\d+)/ then raw_start = $1.to_i strand = $2 count = $3.to_i if strand == "+" xset_start = raw_start xset_end = raw_start + xset_len - 1 elsif strand == "-" xset_end = raw_start xset_start = raw_start - xset_len + 1 else raise "L303: did not recognize strand: #{strand}" end xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ] nbr_of_reads_read += 1 if (nbr_of_reads_read % 200000 == 0) then puts " #{commify(nbr_of_reads_read)}" end end end rescue EOFError in_rh.close puts " #{chr} had #{commify(nbr_of_reads_read)} reads" end return xset_recs_on_this_chr_AoA end #tf_site_pathfile ################################################################################ # NEW: 04 oct 09 # Read a GZIPPED file def get_fp_bed_gz_data_by_chr( bed_gz_dir, bed_gz_basefilename, chr, xset_len, test_data_or_real_data ) puts " #{chr}" #chr_nbr = chr.gsub(/chr/,"") # HS1238, HS1235 bed_gz_file = "#{chr}_#{bed_gz_basefilename}" bed_gz_pathfile = File.join( bed_gz_dir, bed_gz_file ) puts " Opening #{bed_gz_pathfile}" if File.exists?(bed_gz_pathfile) then puts " Found #{bed_gz_pathfile}" else if File.exists?(bed_gz_dir) then puts " Found #{bed_gz_dir}" else puts " Could not find directory #{bed_gz_dir}" end end #begin # in_rh = File.open(reads_pathfile,'r') #rescue # raise "could not open #{reads_pathfile}" #end xset_recs_on_this_chr_AoA = Array.new nbr_of_reads_read = 0 # --------------------------------------------------------------------------- begin File.open(bed_gz_pathfile) do |f| gzip = Zlib::GzipReader.new(f) data = gzip.read.split(/\n/) puts " data have #{commify(data.length)} records" n = 0 data.each do |line| if line =~ /^track/ puts " #{line}" next end # chr22 46246687 46246738 CTACAAGATTAGAAATTACAGTTTAGGGATCATGCAAGCCTCTGGCTCCa 0 + 46246687 46246738 255,0,0 # $1 $2 $3 $4 $5 $6 if line =~ /^(chr\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+([+\-])/ then chr = $1 gstart = $2.to_i gend = $3.to_i strand = $6 if strand == "+" then xset_start = gstart xset_end = gstart + xset_len - 1 elsif strand == "-" then xset_start = gend - xset_len + 1 xset_end = gend else raise "L1003: did not recognize strand: #{strand}" end rec = [ xset_start, xset_end, strand ] xset_recs_on_this_chr_AoA << rec nbr_of_reads_read += 1 if (nbr_of_reads_read % 50000 == 0) then puts " #{commify(nbr_of_reads_read)}" end end # if line end # do |line| end rescue end puts " #{chr} had #{commify(nbr_of_reads_read)} reads" return xset_recs_on_this_chr_AoA end ################################################################################ # chr3 8228272 8228273 6 # $1 $2 $3 $4 # 24may09 - This should not automatically handle both 3- and 4-field TF records def get_tf_sites_by_chr( tf_site_pathfile ) begin in_rh = File.open(tf_site_pathfile,'r') rescue raise "could not open #{tf_site_pathfile}" end tf_sites_by_chr_HoAoA = Hash.new nbr_of_tf_sites_read = 0 begin while line = in_rh.readline.chomp ### Brad's 'new' format - 4 fields #################################### if line =~ /^(chr\S+)\s+(\d+)\s+(\d+)\s+(\d+)$/ then tgt_chr = $1 tgt_start = $2.to_i tgt_end = $3.to_i tgt_id = $4.to_i # There might be a 'strand' in the future tgt_rec_A = [ tgt_start, tgt_id ] if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A else tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A] end nbr_of_tf_sites_read += 1 if (nbr_of_tf_sites_read % 10000 == 0) then puts " #{commify(nbr_of_tf_sites_read)}" end #if tgt_score > 200 then puts " [#{nbr_of_tf_sites_read}] #{tgt_score}" end ### Brad's 'old' format - 3 fields elsif line =~ /^(chr\S+)\s+(\d+)\s+(\d+)$/ then tgt_chr = $1 tgt_start = $2.to_i tgt_id = $3.to_i # There might be a 'strand' in the future tgt_rec_A = [ tgt_start, tgt_id ] if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A else tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A] end nbr_of_tf_sites_read += 1 if (nbr_of_tf_sites_read % 10000 == 0) then puts " #{commify(nbr_of_tf_sites_read)}" end #if tgt_score > 200 then puts " [#{nbr_of_tf_sites_read}] #{tgt_score}" end ####################################################################### ### Standard FP2 format with subpeak ID: 1 3 3000592 3001087 206 12 23 # Corrected FP2 subpeaks, 19june09 elsif line =~ /^(\d+)\s(\S+)\t(\d+)\t(\d+)\t(\d+)\s(\d+)\s(\d+)$/ # id chr start end maxOffset ht reads # $1 $2 $3 $4 $5 $6 $7 tgt_id = $1.to_i tgt_chrom = $2 tgt_start = $3.to_i tgt_end = $4.to_i tgt_max_offset = $5.to_i tgt_chr = "chr#{tgt_chrom}" tgt_coord = tgt_start + tgt_max_offset - 1 tgt_rec_A = [ tgt_coord, tgt_id ] if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A else tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A] end nbr_of_tf_sites_read += 1 if (nbr_of_tf_sites_read % 10000 == 0) then puts " #{commify(nbr_of_tf_sites_read)} FP2 records" end #if tgt_score > 200 then puts " [#{nbr_of_tf_sites_read}] #{tgt_score}" end ####################################################################### # 26 Sept 2010 # FindPeaks differential enrichment coordinate call elsif line =~ /(chr\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)/ # chr1 1667520 1434 1581 13.3042 1.0 0.0 # $1 $2 $3 $4 $5 $6 $7 tgt_chr = $1 tgt_coord = $2.to_i tgt_id = $1 + "_" + $2 tgt_pval = $7.to_f tgt_rec_A = [ tgt_coord, tgt_id ] if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A else tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A] end nbr_of_tf_sites_read += 1 if (nbr_of_tf_sites_read % 1000 == 0) then puts " #{commify(nbr_of_tf_sites_read)} DE records" end else ################################################################# puts line raise "L1414: did not recognize TF file format" end end rescue EOFError in_rh.close puts " read #{commify(nbr_of_tf_sites_read)} TF site recs" end return tf_sites_by_chr_HoAoA end ################################################################################ # chr3 8228272 8228273 6 # $1 $2 $3 $4 # 24may09 - This should not automatically handle both 3- and 4-field TF records def get_de_sites_by_chr( de_coords_pathfile ) begin in_rh = File.open(de_coords_pathfile,'r') rescue raise "could not open #{de_coords_pathfile}" end de_coords_by_chr_HoAoA = Hash.new nbr_of_de_coords_read = 0 begin while line = in_rh.readline.chomp ####################################################################### # 26 Sept 2010 # FindPeaks differential enrichment coordinate call if line =~ /(chr\S+)\s+(\d+)\s+(\d+)\s+(-?\d+)\s+(\S+)\s+(\S+)\s+(\S+)/ # chr1 1667520 1434 1581 13.3042 1.0 0.0 # $1 $2 $3 $4 $5 $6 $7 tgt_chr = $1 tgt_coord = $2.to_i tgt_id = $1 + "_" + $2 tgt_pval = $7.to_f tgt_rec_A = [ tgt_coord, tgt_id ] if de_coords_by_chr_HoAoA.has_key?(tgt_chr) then de_coords_by_chr_HoAoA[tgt_chr] << tgt_rec_A else de_coords_by_chr_HoAoA[tgt_chr] = [tgt_rec_A] end nbr_of_de_coords_read += 1 if (nbr_of_de_coords_read % 1000 == 0) then puts " #{commify(nbr_of_de_coords_read)} DE records" end else ################################################################# puts line raise "L1537: did not recognize DE file format" end end rescue EOFError in_rh.close puts " read #{commify(nbr_of_de_coords_read)} DE-coord recs" end return de_coords_by_chr_HoAoA end ################################################################################ def generate_nonTSS_profile_for_overlapped_xsets(chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, xset_len, window_half_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr) # Generate a window around this site, using absolute coords rgn_start = tgt_coord - window_half_width rgn_end = tgt_coord + window_half_width - 1 rgn_len = rgn_end - rgn_start + 1 # Probably smart to check overlap against a rgn that has been *extended* by XSET # length on either end, so that we only need to check for fully-contained overlap, # and can ignore checking for partial overlap. # Then we can just take a slice out of the extended profile that corresponds # to the central region. extended_rgn_start = rgn_start - xset_len + 1 extended_rgn_end = rgn_end + xset_len - 1 extended_rgn_len = extended_rgn_end - extended_rgn_start extended_profile_A = Array.new( rgn_len, 0 ) # Initialize array values to 0 # Use a hash that has absolute genomic coordinates as keys (to facilitate updating a profile) extended_profile_H = Hash.new (extended_rgn_start..extended_rgn_end).each { |abs_coord| extended_profile_H[abs_coord] = 0 } # xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ] # 0 1 2 3 sites_that_overlap_current_rgn_AoA = Array.new xsets_that_overlap_extended_rgn_AoA = xset_recs_on_this_chr_AoA.find_all { |xset_rec_A| (extended_rgn_start <= xset_rec_A[0]) && (extended_rgn_end >= xset_rec_A[1]) } nbr_of_overlapping_xsets = xsets_that_overlap_extended_rgn_AoA.length puts " #{run}: #{chr} (#{nbr_tgt_recs_processed_on_this_chr}/#{nbr_target_recs_on_this_chr}): #{nbr_of_overlapping_xsets} overlapping xsets" if nbr_of_overlapping_xsets > 0 then #sorted_xsets_that_overlap_extended_rgn_AoA = xsets_that_overlap_extended_rgn_AoA.sort_by { |xset_rec| [xset_recs_on_this_chr_AoA[0]] } xsets_that_overlap_extended_rgn_AoA.each do |xset_rec_A| xset_abs_start = xset_rec_A[0] xset_abs_end = xset_rec_A[1] (xset_abs_start..xset_abs_end).each { |coord| extended_profile_H[coord] += 1 } end else #puts " #{current_site_motif_name} (#{current_site_motif_id}): overlaps no other site" end # Now extract the profile height array extracted_profile_hts_A = Array.new (rgn_start..rgn_end).each { |abs_coord| extracted_profile_hts_A << extended_profile_H[abs_coord] } return extracted_profile_hts_A end ################################################################################ # 17 Oct. 09, needed for long regions. Not implemented yet. def check_whether_region_overlaps_a_nearby_gene() end ################################################################################ # This does strand-corrected profiles around TSS's. # Last modified 10 May 2009. def generate_TSS_profile_for_overlapped_xsets(chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, tgt_strand, xset_len, window_half_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr) # Generate a window around this site, using absolute coords rgn_start = tgt_coord - window_half_width rgn_end = tgt_coord + window_half_width - 1 rgn_len = rgn_end - rgn_start + 1 # Probably smart to check overlap against a rgn that has been *extended* by XSET # length on either end, so that we only need to check for fully-contained overlap, # and can ignore checking for partial overlap. # Then we can just take a slice out of the extended profile that corresponds # to the central region. extended_rgn_start = rgn_start - xset_len + 1 extended_rgn_end = rgn_end + xset_len - 1 extended_rgn_len = extended_rgn_end - extended_rgn_start extended_profile_A = Array.new( rgn_len, 0 ) # Initialize array values to 0 # Use a hash that has absolute genomic coordinates as keys (to facilitate updating a profile) extended_profile_H = Hash.new (extended_rgn_start..extended_rgn_end).each { |abs_coord| extended_profile_H[abs_coord] = 0 } # xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ] # 0 1 2 3 sites_that_overlap_current_rgn_AoA = Array.new xsets_that_overlap_extended_rgn_AoA = xset_recs_on_this_chr_AoA.find_all { |xset_rec_A| (extended_rgn_start <= xset_rec_A[0]) && (extended_rgn_end >= xset_rec_A[1]) } nbr_of_overlapping_xsets = xsets_that_overlap_extended_rgn_AoA.length puts " #{run}: #{chr} (#{nbr_tgt_recs_processed_on_this_chr}/#{nbr_target_recs_on_this_chr}): #{nbr_of_overlapping_xsets} overlapping xsets" if nbr_of_overlapping_xsets > 0 then #sorted_xsets_that_overlap_extended_rgn_AoA = xsets_that_overlap_extended_rgn_AoA.sort_by { |xset_rec| [xset_recs_on_this_chr_AoA[0]] } xsets_that_overlap_extended_rgn_AoA.each do |xset_rec_A| xset_abs_start = xset_rec_A[0] xset_abs_end = xset_rec_A[1] (xset_abs_start..xset_abs_end).each { |coord| extended_profile_H[coord] += 1 } end else #puts " #{current_site_motif_name} (#{current_site_motif_id}): overlaps no other site" end # Now extract the profile height array extracted_profile_hts_A = Array.new (rgn_start..rgn_end).each { |abs_coord| extracted_profile_hts_A << extended_profile_H[abs_coord] } #strand_corrected_profile_hts_A = Array.new if tgt_strand == "+" then strand_corrected_profile_hts_A = extracted_profile_hts_A elsif tgt_strand == "-" then strand_corrected_profile_hts_A = extracted_profile_hts_A.reverse else raise "L911: did not recognize strand #{tgt_strand}" end return strand_corrected_profile_hts_A end ################################################################################ # This is for strand-corrected =asymmetric= profiles around TSS's. # This is new and is not working yet. # It should return stranded i.e. 5'-to-3'-oriented profiles. # We should also check whether this region overlaps the end of any other gene/transcript. (and respond when it does) # Last modified 17 October 2009. def generate_asymmetric_TSS_profile_for_overlapped_xsets( chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, tgt_strand, tgt_id, tgt_symbol, xset_len, # window_half_width, window_5pr_width, window_3pr_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr, transcript_recs_on_this_chr_AoA ) # Generate a window around this site, using absolute coords # Probably smart to check overlap against a rgn that has been *extended* by XSET # length on either end, so that we only need to check for fully-contained overlap, # and can ignore checking for partial overlap. # Then we can just take a slice out of the extended profile that corresponds # to the central region. # What we should do: # 1. Generate a 'naive' sequence region with a naiveStart (left end) and naiveEnd (right end). # 'Naive' start and end are as they'd appear at the UCSC genome browser, and is the convention used at UCSC for gene coordinates. # 5'-|---------------------->-----|-3' (+) # 3'-|-----<----------------------|-5' (-) # 2. Calculate the profile for the naive region, regardless of the target strand. # 3. If it's a (-)-strand target, reverse the region list just before returning it. This makes it 5'-to-3' oriented. # Reversed (-)-strand profile that is returned # 5'-|---------------------->-----|-3' (-/reversed) case tgt_strand when "+" naive_rgn_start = tgt_coord - window_5pr_width + 1 naive_rgn_end = tgt_coord + window_3pr_width rgn_len = naive_rgn_end - naive_rgn_start + 1 extended_rgn_start = naive_rgn_start - xset_len extended_rgn_end = naive_rgn_end + xset_len extended_rgn_len = extended_rgn_end - extended_rgn_start when "-" naive_rgn_end = tgt_coord + window_5pr_width naive_rgn_start = tgt_coord - window_3pr_width + 1 rgn_len = naive_rgn_end - naive_rgn_start + 1 extended_rgn_end = naive_rgn_end + xset_len extended_rgn_start = naive_rgn_start - xset_len extended_rgn_len = extended_rgn_start - extended_rgn_end end # Overlap has many cases!! # RefSeq transcripts: rec = [txStart,txEnd,strand,id,symbol] transcripts_that_overlap_extended_rgn_AoA = transcript_recs_on_this_chr_AoA.find_all { |tscpt_rec_A| (extended_rgn_start <= tscpt_rec_A[0]) && (extended_rgn_end >= tscpt_rec_A[1]) } #--- OLD code --------------------------------------------------------------- extended_profile_A = Array.new( rgn_len, 0 ) # Initialize array values to 0 # Use a hash that has absolute genomic coordinates as keys (to facilitate updating a profile) extended_profile_H = Hash.new (extended_rgn_start..extended_rgn_end).each { |abs_coord| extended_profile_H[abs_coord] = 0 } # xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ] # 0 1 2 3 sites_that_overlap_current_rgn_AoA = Array.new xsets_that_overlap_extended_rgn_AoA = xset_recs_on_this_chr_AoA.find_all { |xset_rec_A| (extended_rgn_start <= xset_rec_A[0]) && (extended_rgn_end >= xset_rec_A[1]) } nbr_of_overlapping_xsets = xsets_that_overlap_extended_rgn_AoA.length puts " #{run}: #{chr} (#{nbr_tgt_recs_processed_on_this_chr}/#{nbr_target_recs_on_this_chr}) #{tgt_id} #{tgt_symbol} (#{tgt_strand}): #{nbr_of_overlapping_xsets} overlapping xsets" if nbr_of_overlapping_xsets > 0 then #sorted_xsets_that_overlap_extended_rgn_AoA = xsets_that_overlap_extended_rgn_AoA.sort_by { |xset_rec| [xset_recs_on_this_chr_AoA[0]] } xsets_that_overlap_extended_rgn_AoA.each do |xset_rec_A| xset_abs_start = xset_rec_A[0] xset_abs_end = xset_rec_A[1] (xset_abs_start..xset_abs_end).each { |coord| extended_profile_H[coord] += 1 } end else #puts " #{current_site_motif_name} (#{current_site_motif_id}): overlaps no other site" end # Now extract the profile height array extracted_profile_hts_A = Array.new (naive_rgn_start..naive_rgn_end).each { |abs_coord| extracted_profile_hts_A << extended_profile_H[abs_coord] } #strand_corrected_profile_hts_A = Array.new if tgt_strand == "+" then strand_corrected_profile_hts_A = extracted_profile_hts_A elsif tgt_strand == "-" then strand_corrected_profile_hts_A = extracted_profile_hts_A.reverse else raise "L911: did not recognize strand #{tgt_strand}" end return strand_corrected_profile_hts_A #--- OLD code --------------------------------------------------------------- end ################################################################################ def get_hg18_chr_list() chrs_A = [ "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX" #"chrM", #"chrX" #, #"chrY" ] chrom_lens_H = { "chr1" => 247249719, "chr2" => 242951149, "chr3" => 199501827, "chr4" => 191273063, "chr5" => 180857866, "chr6" => 170899992, "chr7" => 158821424, "chr8" => 146274826, "chr9" => 140273252, "chr10" => 135374737, "chr11" => 134452384, "chr12" => 132349534, "chr13" => 114142980, "chr14" => 106368585, "chr15" => 100338915, "chr16" => 88827254, "chr17" => 78774742, "chr18" => 76117153, "chr19" => 63811651, "chr20" => 62435964, "chr21" => 46944323, "chr22" => 49691432, "chrX" => 154913754 #, #"chrM" => 16571, #"chrY" => 57772954 } puts "...hg18 chromInfo OK" return chrs_A,chrom_lens_H end def get_mm9_chr_list() chrs_A = [ "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chrX" #"chrM", #"chrY" ] chrom_lens_H = { "chr1" => 197195432, "chr2" => 181748087, "chr3" => 159599783, "chr4" => 155630120, "chr5" => 152537259, "chr6" => 149517037, "chr7" => 152524553, "chr8" => 131738871, "chr9" => 124076172, "chr10" => 129993255, "chr11" => 121843856, "chr12" => 121257530, "chr13" => 120284312, "chr14" => 125194864, "chr15" => 103494974, "chr16" => 98319150, "chr17" => 95272651, "chr18" => 90772031, "chr19" => 61342430, "chrX" => 166650296, #["chrY" => 15902555], #["chrM" => 16299], } return chrs_A,chrom_lens_H end def get_mm8_chr_list() chrs_A = [ "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chrX" #"chrM", #"chrY" ] chrom_lens_H = { "chr1" => 197069962, "chr2" => 181976762, "chr3" => 159872112, "chr4" => 155029701, "chr5" => 152003063, "chr6" => 149525685, "chr7" => 145134094, "chr8" => 132085098, "chr9" => 124000669, "chr10" => 129959148, "chr11" => 121798632, "chr12" => 120463159, "chr13" => 120614378, "chr14" => 123978870, "chr15" => 103492577, "chr16" => 98252459, "chr17" => 95177420, "chr18" => 90736837, "chrX" => 165556469, "chrY" => 16029404, "chrM" => 16299 } return chrs_A,chrom_lens_H end ################################################################################ ### MAIN ####################################################################### puts "1. Get chromInfo" #------------------------------------------------------------------------------- case assembly when "mm8" puts "getting mm8 chromInfo" chrs_A,chrom_lens_H = get_mm8_chr_list() when "mm9" puts "getting mm9 chromInfo" chrs_A,chrom_lens_H = get_mm9_chr_list() when "hg18" puts "getting hg18 chromInfo" chrs_A,chrom_lens_H = get_hg18_chr_list() end #------------------------------------------------------------------------------- #puts "\n1. get_gnf_gcRMA_CD4pTcell_expression_data()" #gnf_gcRMA_H = get_gnf_gcRMA_CD4pTcell_expression_data( su_data_dir, gnf_gcRMA_CD4pTcell_file ) # gnf_gcRMA_H[probe_id] = expression_val case target_type when "tss" #---------------------------------------------------------------------------- #puts "\n2. get_knownGene_data()" #knownGene_HoA = get_knownGene_data_by_chr( ucsc_data_dir, knownGene_file ) # kg_rec = [txStart,txEnd,strand,kg_id,kg_symbol] # kgXref_symbol_to_kg_id_HoA[gene_symbol] = [kg_id,kg_id,kg_id,...] #---------------------------------------------------------------------------- #puts "\n3. get_kgXref_data()" #kgXref_HoA, kgXref_symbol_to_kg_id_HoA = get_kgXref_data( ucsc_data_dir, kgXref_file ) # kgXref_H[kg_id] = geneSymbol puts "\n2. Get RefSeq or knownGene transcripts" case gene_type when "refFlat.txt" puts "refFlat.txt" #---------------------------------------------------------------------------- transcripts_by_chr_HoAoA = get_refseq_data_by_chr( ucsc_data_dir, transcript_file ) #---------------------------------------------------------------------------- # rec = [txStart,txEnd,strand,id,symbol] when "knownGene.txt" puts "knownGene.txt" #---------------------------------------------------------------------------- kgXref_HoA, kgXref_symbol_to_kg_id_HoA = get_kgXref_data( ucsc_data_dir, kgXref_file ) # return kgXref_HoA, kgXref_symbol_to_kg_id_HoA transcripts_by_chr_HoAoA = get_knownGene_data_by_chr( ucsc_data_dir, transcript_file, kgXref_HoA ) #---------------------------------------------------------------------------- # kg_rec = [txStart,txEnd,strand,kg_id,kg_symbol] end when "tf" puts "\n2. get TF site data" #---------------------------------------------------------------------------- tf_sites_by_chr_HoAoA = get_tf_sites_by_chr( tf_site_pathfile ) #---------------------------------------------------------------------------- # tgt_rec_A = [ tgt_start, tgt_id ] when "de_coords" puts "\n2. Get DE coord data" #---------------------------------------------------------------------------- de_coords_by_chr_HoAoA = get_de_sites_by_chr( de_coords_pathfile ) #---------------------------------------------------------------------------- # tgt_rec_A = [ tgt_start, tgt_id ] end puts "\n4. get reads data, per chromosome" #reads_chrs_A.each do |chr| chrs_A.each do |chr| case data_type when "reads" # For either genes/TSS's or TF sites as targets #---------------------------------------------------------------------------- # Get reads for this chr reads_basefilename = "MM0281.um.eland.noDots.filtered" #.1.reads" xset_recs_on_this_chr_AoA = get_reads_data_by_chr( reads_dir, reads_basefilename, chr, xset_len, test_data_or_real_data ) #---------------------------------------------------------------------------- when "fp_bed_gz" #---------------------------------------------------------------------------- # chr1_HS1238_H4ac_hg18_maq.bed.gz # gunzip # Then: xset_recs_on_this_chr_AoA = get_fp_bed_gz_data_by_chr( bed_gz_dir, bed_gz_basefilename, chr, xset_len, test_data_or_real_data ) #gzip #---------------------------------------------------------------------------- end # Now choose transcript TSSs or TF sites or DE-coords ####################### case target_type ############################################################################# when "tss" # Get all target locations and strands on this chr transcript_recs_on_this_chr_AoA = transcripts_by_chr_HoAoA[chr] nbr_target_recs_on_this_chr = transcript_recs_on_this_chr_AoA.length puts "\n #{reads_label}: #{chr} has #{nbr_target_recs_on_this_chr} records" # Now process each transcript record, one at a time nbr_tgt_recs_processed_on_this_chr = 0 transcript_recs_on_this_chr_AoA.each do |tgt_rec| nbr_tgt_recs_processed_on_this_chr += 1 # rec = [txStart,txEnd,strand,id,symbol] RefSeq, 10 May 2009 # 0 1 2 3 4 tgt_id = tgt_rec[3] tgt_symbol = tgt_rec[4] tgt_strand = tgt_rec[2] if tgt_strand == "+" then tgt_coord = tgt_rec[0] elsif tgt_strand == '-' then tgt_coord = tgt_rec[1] else raise "did not recognize strand #{strand}" end # 10 May 2009 - this 'if' lets us debug results # Comment it out for an actual run. tgt_ids_A = %w[ NM_009460 NM_018868 NM_007561 NM_001045513 ] #if tgt_ids_A.include?(tgt_id) then #puts "\ngenerating a profile for #{tgt_id}" strand_corrected_profile_hts_A = Array.new #---------------------------------------------------------------------- case window when "symmetric" #strand_corrected_profile_hts_A = generate_TSS_profile_for_overlapped_xsets( # chr, run, # nbr_tgt_recs_processed_on_this_chr, # tgt_coord, tgt_strand, # xset_len, window_half_width, # xset_recs_on_this_chr_AoA, # nbr_target_recs_on_this_chr # ) when "asymmetric" strand_corrected_profile_hts_A = generate_asymmetric_TSS_profile_for_overlapped_xsets( chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, tgt_strand, tgt_id, tgt_symbol, xset_len, # window_half_width, window_5pr_width, window_3pr_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr, transcript_recs_on_this_chr_AoA ) end #---------------------------------------------------------------------- area_under_the_profile = 0 strand_corrected_profile_hts_A.each { |ht| area_under_the_profile += ht } center_of_strand_corrected_profile_hts_A = strand_corrected_profile_hts_A[999..2999] area_under_the_centre_of_the_profile = 0 center_of_strand_corrected_profile_hts_A.each { |ht| area_under_the_centre_of_the_profile += ht } case test_data_or_real_data when "real_data" profile_wh.puts [ tgt_id, tgt_symbol, chr, tgt_coord, tgt_strand, area_under_the_profile, area_under_the_centre_of_the_profile, strand_corrected_profile_hts_A.join("\t") ].join("\t") when "test_data" profile_wh.puts strand_corrected_profile_hts_A.join("\n") end #end # End of debugging loop end ############################################################################# when "tf" # Get all target locations and strands on this chr target_recs_on_this_chr_AoA = tf_sites_by_chr_HoAoA[chr] nbr_target_recs_on_this_chr = target_recs_on_this_chr_AoA.length puts "\n #{reads_label}: #{chr} has #{nbr_target_recs_on_this_chr} TF sites" nbr_tgt_recs_processed_on_this_chr = 0 target_recs_on_this_chr_AoA.each do |tgt_rec_A| ######################### nbr_tgt_recs_processed_on_this_chr += 1 # tgt_rec_A << [ tgt_start, tgt_id ] # 0 1 tgt_start = tgt_rec_A[0].to_i tgt_id = tgt_rec_A[1].to_i tgt_strand = "+" # Currently fixed to '+' strand if tgt_strand == "+" then tgt_coord = tgt_rec_A[0] elsif tgt_strand == '-' raise "have only '-' strand TF sites for now" else raise "did not recognize strand #{tgt_strand}" end #---------------------------------------------------------------------- case window when "symmetric" # This is what is already working profile_heights_A = generate_nonTSS_profile_for_overlapped_xsets( chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, xset_len, window_half_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr ) when "asymmetric" # This is new and is not working yet. It should return strand-oriented (5'-to-3') profiles. profile_heights_A = generate_asymmetric_TSS_profile_for_overlapped_xsets( chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, tgt_strand, xset_len, # window_half_width, window_5pr_width, window_3pr_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr) end #---------------------------------------------------------------------- area_under_the_profile = 0 profile_heights_A.each { |ht| area_under_the_profile += ht } case test_data_or_real_data when "real_data" profile_wh.puts [ tgt_id, ".", chr, tgt_coord, "+", area_under_the_profile, profile_heights_A.join("\t") ].join("\t") when "test_data" profile_wh.puts profile_heights_A.join("\n") end end ############################################################################# when "de_coords" # Get all target locations and strands on this chr # get_de_sites_by_chr( de_coords_pathfile ) target_recs_on_this_chr_AoA = de_coords_by_chr_HoAoA[chr] nbr_target_recs_on_this_chr = target_recs_on_this_chr_AoA.length puts "\n #{reads_label}: #{chr} has #{nbr_target_recs_on_this_chr} DE coords" nbr_tgt_recs_processed_on_this_chr = 0 target_recs_on_this_chr_AoA.each do |tgt_rec_A| ######################### nbr_tgt_recs_processed_on_this_chr += 1 # tgt_rec_A = [ tgt_coord, tgt_id ] # 0 1 tgt_coord = tgt_rec_A[0].to_i tgt_id = tgt_rec_A[1] #tgt_strand = "+" # Currently fixed to '+' strand # #if tgt_strand == "+" then # tgt_coord = tgt_rec_A[0] #elsif tgt_strand == '-' # raise "have only '-' strand DE sites for now" #else # raise "did not recognize strand #{tgt_strand}" #end #---------------------------------------------------------------------- case window when "symmetric" # This is what is already working profile_heights_A = generate_nonTSS_profile_for_overlapped_xsets( chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, xset_len, window_half_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr ) when "asymmetric" # This is new and is not working yet. It should return strand-oriented (5'-to-3') profiles. profile_heights_A = generate_asymmetric_TSS_profile_for_overlapped_xsets( chr, run, nbr_tgt_recs_processed_on_this_chr, tgt_coord, tgt_strand, xset_len, # window_half_width, window_5pr_width, window_3pr_width, xset_recs_on_this_chr_AoA, nbr_target_recs_on_this_chr) end #---------------------------------------------------------------------- area_under_the_profile = 0 profile_heights_A.each { |ht| area_under_the_profile += ht } case test_data_or_real_data when "real_data" profile_wh.puts [ tgt_id, ".", chr, tgt_coord, "+", area_under_the_profile, profile_heights_A.join("\t") ].join("\t") when "test_data" profile_wh.puts profile_heights_A.join("\n") end end end end profile_wh.close log_wh.close puts "\ndone! See: #{out_pathfile}"