#!/usr/bin/env ruby

script_name = "frag_profiler_with_area.rb"
version =     "26-Sept-2010:16h00"

date = "2010-09-26"

=begin

INTENT
1. Generate per-location profiles for XSET overlap in symmetric or asymmetric windows
   around e.g. peak maxes, TF sites, TSS's, DE coordinates.
   As in Heintzmann Nature 2009, Figs. 1, 2.

NEEDS
1. 'reads' files, per-chr.
OR
FindPeaks converted BED file: (from a MAP file)

track name="chr22_HS1238_H4ac_hg18 Sequences" description="Illumina maq-aligned reads - HS1238_H4ac_hg18" visibility=4 priority=10 itemRgb="On"
chr22   14430607        14430658        TtACTGtGTCAGAACAGAGTGTGCCGATTGTGGTCAGGACTCCATAGCAT      0       -       14430607        14430658        0,0,255
chr22   14430930        14430981        TGGGGGCCAAGGCCCTTGTTATGGGGAtGAAGGCTCTTAGGTGGTAGCCC      0       -       14430930        14430981        0,0,255
chr22   14430935        14430986        GCCAAGGCCcTTGtTaTGGGGATGaAGGctctTagGtGGtAGcCcTccaa      0       +       14430935        14430986        255,0,0


NB
1. We assume that 'reads' 'starts' are frag-starts that include directionality.
I.e. a (+)-strand read's start is the frag's left end, and (-) is the right end.

STATUS
26Sep2010 - Added FindPeaks4 DE call records, for Iva, HS1238/5 H4ac.
17may2010 - Foxa2, E14.5 liver, for Olivia
12feb2010 - MafA, MM0555-1Lane
05nov09 - Added UCSC knownGene transcripts (66803) instead of RefSeq transcripts. Seems OK.
17oct09 - Added asymmetric regions, e.g. -2kb/+500bp.
          Need to add region-gene-overlap detector.
05oct09 - Run chr22 locally for HS1238 and HS1235, for +/-2kb on REfSeq TSS's.
          Running on xhost06.
04oct09 - Run from a FP converted Maq to BED file, gzipped.
24mai09 - Ran new TF lists and a RefSeq list for Brad. Adjusted the TF record for ID=$4 rather than $3.
          This should not automatically handle both 3- and 4-field TF records.
12mai09 - Adjusted the configuration section for TF-site-based runs. Running on xhost06.
10mai09 - Now can do strand-corrected TSS-based profiles. Verified chr1 K4me3.
09mai09 - Adjusted profile subroutine, checked it, seems better.
04mai09 - Did a Foxa2-pkMax/Foxa2-reads test run. Brad's suggestion. mm8.
02mai09 - Runs. Semi-debugged with 2 small test cases.

COMMENTS
1. For a TSS profile, for (-) strand, we generate a standard (+)-strand profile vector
then reverse it and return it. This is simple to implement but only works for a symmetric window (which is what we have now).
2. The 'standard' TF list from Brad has his peak 'ID' as the fourth column. This ID should be retained in the profiles generated.

TODO
1. Check strand vs. start coord for 'reads' files.
2. A separate script (flexibly) applies binning to reduce the file size.
3. Need to add region-gene-overlap detector.

knownGene.txt
uc001aaa.2	chr1	+	1115	4121	1115	1115	3	1115,2475,3083,	2090,2584,4121,		uc001aaa.2
uc009vip.1	chr1	+	1115	4272	1115	1115	2	1115,2475,	2090,4272,		uc009vip.1
uc001aab.2	chr1	-	4268	14764	4268	4268	10	4268,4832,5658,6469,6716,7095,7468,7777,8130,14600,	4692,4901,5810,6628,6918,7231,7605,7924,8242,14764,		uc001aab.2
...

=end

require 'zlib'

#===========================

#assembly = "mm8"
#assembly = "mm9"
assembly = "hg18"

#target_type = "tss"
#target_type = "tf"
#target_type = "tf_pred_sites"
target_type = "de_coords"

#test_data_or_real_data = "test_data"
test_data_or_real_data = "real_data"

#gene_type = "refFlat.txt"
#gene_type = "knownGene.txt"

data_type = "reads"
#data_type = "fp_bed_gz"

#host = "laptop"
host = "xhost"

#calc = "profile_only"
#calc = "area_only"
#calc = "profile_and_area"

reads_chrs_A = %w[ chr22 ]
#reads_chrs_A = %w[ chr1 chr2 chr3 chr4 ]


xset_len = 200

window = "symmetric"
#window = "asymmetric"

case window
when "symmetric"
   window_half_width = 500 #bp
   puts "using a 'symmetric' window: +/-#{500} bp"
   #raise "please change to asymmetric mode and use equal length half-widths"
when "asymmetric"
   window_5pr_width = 2500 #bp
   window_3pr_width = 1500 #bp
   puts "using an 'asymmetric' window: 5':#{window_5pr_width}/3':#{window_3pr_width}bp"
end


puts "xset_len: #{xset_len} bp"
#puts "window_half_width: #{window_half_width} bp"


### Reference targets: TSS's or TF sites #######################################
case target_type
when "tf"
   case test_data_or_real_data
   when "real_data"
      # TF sites as targets to profile around
      case host
         when "laptop"
            tf_site_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/TF_sites"
         when "xhost"
            # Should move this down into each run's 'case' block
            tf_site_dir = "/home/grobertson/Profiles/TF_sites"
      end

   when "test_data"
      case host
         when "laptop"
            tf_site_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/Tests"
            tf_site_file = "tf_sites.test.txt"
         end

      tf_site_pathfile = File.join( tf_site_dir, tf_site_file )
      puts "tf_site_file: #{tf_site_pathfile}"

      reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/Tests"
      reads_basefilename = "test_5."
      reads_label = "test_5"

   end

when "de_coords"  ### 26 September 2010  #######################################

   de_coords_pathfile = "/projects/remc_bigdata/Karsan/HS1238_kd/de-coord_profiles/HS1238_compare_HS1235-alpha0.01_q10_w400_min3_subp0.5_triangle_subpeaks.regions.p1.0e-06.nofilt.20100728.txt"
   puts "using DE-coords: #{de_coords_pathfile}"

end


### SET THIS -------------------------------------------------------------------

### For Brad - mismatched - mouse - vs predicted Pdx1 binding sites, not peak maxes
#run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0281-H3K4me3-adult-liver"
# /projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm9/eland/reads
# MM0281.um.eland.noDots.filtered.1.reads

#run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0282-H3K4me1-adult-liver"
# /projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm9/eland/reads
#MM0282.um.eland.noDots.filtered.1.reads

#run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0310-H4ac-adult-liver"
# /projects/03/genereg/projects/SOLEXA/H4ac/MM0310/mm9/eland/reads
# MM0310.um.eland.noDots.filtered.1.reads

#run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0411-H3K27me3-adult-liver"
# /projects/03/genereg/projects/SOLEXA/H3K27me3/MM0411/mm9/export/reads
# 1.reads

#run = "MM0388-Pdx1-adult-islets_pred-binding-sites__MM0507-H3K9me3-adult-liver"
# /projects/03/genereg/projects/SOLEXA/H3K9me3/MM0507/mm9/export/reads
# 1.reads


#-------------------------------------------------------------------------------
# H4ac, HS1238, CSL knockdown
# chr1_HS1238_H4ac_hg18_maq.bed.gz
run = "H4ac_HS1238_CSL_knockdown"
#run = "H4ac_HS1235_vectorControl"

# Adult islets
# ------------
# matched
# -------
#run = "PMNrgns__MM0386-H3K4me1-adult-islets"
#run = "MM0555-MafA-1L-2517pks_islets__MM0386-H3K4me1-islets"
#run = "MM0555-MafA-1L-islets__MM0386-H3K4me1-islets"
#run = "MM0325-Foxa2-islets__MM0386-H3K4me1-islets"
#run = "MM0325-Foxa2-islets__MM0387-H3K4me3-islets"
#run = "MM0388-Pdx1-adult-islets__MM0386-H3K4me1-adult-islets"
#run = "MM0388-Pdx1-islets__MM0387-H3K4me3-islets"


# mismatched
# ----------
#run = "MM0325-Foxa2-islets__MM0282-H3K4me1-liver"
#run = "MM0325-Foxa2-islets__MM0281-H3K4me3-liver"

# RefSeq
# ------
#run = "RefSeq__MM0386-H3K4me1-islets"
#run = "RefSeq__MM0387-H3K4me3-islets"

#----------------------------------------------------
# E14.5 liver
#run = "MM0399-Foxa2-E14.5-liver__MM0393-H3K4me1-E14.5_liver"

# Adult liver
# -----------
# matched
# -------
#run = "MM0261-Foxa2-liver__MM0282-H3K4me1-liver"
#run = "MM0261-Foxa2-liver__MM0281-H3K4me3-liver"
#run = "MM0351-Hnf4a-liver__MM0282-H3K4me1-liver"
#run = "MM0351-Hnf4a-liver__MM0281-H3K4me3-liver"

# mismatched
# ----------
#run = "MM0261-Foxa2-liver__MM0386-H3K4me1-islets"
#run = "MM0261-Foxa2-liver__MM0387-H3K4me3-islets"
#run = "MM0261-Foxa2-adult-liver__MM0393-E14.5-H3K4me1-liver"

# RefSeq
# ------
#run = "RefSeq__MM0282-H3K4me1-liver"
#run = "RefSeq__MM0281-H3K4me3-liver"
#----------------------------------------------------

# Stat1 HeLa
#run = "Stat1_HeLa__HS0260-4L_H3K4me1_unstim"
#run = "Stat1_HeLa__HS0262-4L_H3K4me1_stim"

puts run
case run

################################################################################
when "H4ac_HS1238_CSL_knockdown"

   puts "H4ac_HS1238_CSL_knockdown"

   data_type =   "fp_bed_gz"

   if target_type == "tss"  #RefSeq or knownGene TSS's
      case gene_type
         when "refFlat.txt"
            transcript_file = "refFlat.txt"
            case host
               when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/RefSeq"
               when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/hg18/RefSeq"
            end

         when "knownGene.txt"
            transcript_file = "knownGene.txt"
            kgXref_file =     "kgXref.txt"
            case host
               when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/knownGene"
               when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/hg18/knownGene"
            end
      end
   end

   reads_label = "HS1238_CSL-kd_hg18"
   bed_gz_basefilename = "HS1238_H4ac_hg18_maq.bed.gz"  # Reads filenames:  "chr22_HS1238_H4ac_hg18_maq.bed.gz"

   case host
      when "laptop" then
         bed_gz_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1238/BED"
         out_dir =    "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1238/Profiles/20091105"
      when "xhost" then
         #bed_gz_dir = "/projects/mapp_etc/other/HS1238/bed_gz"
         bed_gz_dir = "/projects/remc_bigdata/Karsan/HS1238_kd/bed_gz"
         #out_dir = "/projects/mapp_etc/other/HS1238/profiles/2500-1500bp/knownGene"
         out_dir = "/projects/remc_bigdata/Karsan/HS1238_kd/de-coord_profiles/20100926/plus-minus_500bp"
   end

   # asymmetric
   #puts "window is: (+) 5'-#{window_5pr_width}/(-) +#{window_3pr_width}-3' bp"
   #outfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.txt"
   #logfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.LOG.txt"

   # symmetric
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.LOG.txt"


################################################################################
when "H4ac_HS1235_vectorControl"

   puts "H4ac_HS1235_vectorControl"
   data_type = "fp_bed_gz"

   if target_type == "tss"  #RefSeq TSS's
      case gene_type
         when "refFlat.txt"
            transcript_file = "refFlat.txt"
            case host
               when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/RefSeq"
               when "xhost" then ucsc_data_dir =  "/home/grobertson/DATA/UCSC/hg18/RefSeq"
            end

         when "knownGene.txt"
            transcript_file = "knownGene.txt"
            kgXref_file =     "kgXref.txt"
            case host
               when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/hg18/knownGene"
               when "xhost" then ucsc_data_dir =  "/home/grobertson/DATA/UCSC/hg18/knownGene"
            end
      end
   end

   reads_label = "HS1235_vectorControl_hg18"
   bed_gz_basefilename = "HS1235_H4ac_hg18_vecCtl_maq.bed.gz"  # Reads filenames:  chr22_HS1235_H4ac_hg18_vecCtl_maq.bed.gz

   case host
      when "laptop" then
         bed_gz_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1235_vecCntl/BED"
         out_dir =    "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Karsan_Lab/Data/HS1235_vecCntl/Profiles/20091105"
      when "xhost" then
         #bed_gz_dir = "/projects/mapp_etc/other/HS1235/bed_gz"
         bed_gz_dir = "/projects/remc_bigdata/Karsan/HS1235_ctl/bed_gz"
         #out_dir =    "/projects/mapp_etc/other/HS1235/profiles/2500-1500bp/knownGene"
         out_dir = "/projects/remc_bigdata/Karsan/HS1238_kd/de-coord_profiles/20100926/plus-minus_500bp"
   end

   # asymmetric
   #puts "window is: (+) 5'-#{window_5pr_width}/(-) +#{window_3pr_width}-3' bp"
   outfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.txt"
   logfile = "#{run}.5pr-#{window_5pr_width}_to_#{window_3pr_width}-3pr.x#{xset_len}.#{target_type}_profiles.#{date}.a.LOG.txt"

   # symmetric
   #outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.a.txt"
   #logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}_profiles.#{date}.a.LOG.txt"


################################################################################
# 30 July 2010
when run = "PMNrgns__MM0386-H3K4me1-adult-islets"
   puts run
   tf_site_file = "PMNtripleboundloci_locusmiddle.3cols.20100720.bed" # chr3	5431143	1

   reads_label = "PMNrgns__MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm9/export/reads"
   end

   case host
   when "laptop" then out_dir = reads_dir
   when "xhost" then out_dir = "/home/grobertson/Profiles/PMNrgns__MM0386_H3K4me1_adult_islets/mm9/plus-minus-2kb"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


# 17 May 2010
when "MM0399-Foxa2-E14.5-liver__MM0393-H3K4me1-E14.5_liver"

   puts run
   tf_site_file = "Foxa2_E14.5_liver_10847-peakmax.mm8.20100517.txt" # chr3	5431143	1

   reads_label = "MM0393_H3K4me1_E14.5_liver"
   reads_basefilename = ""  # Reads filenames:  1.reads

   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then
         reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0393/mm8/export/reads"
         out_dir =   "/home/grobertson/Profiles/MM0399_Foxa2_E14.5_liver/MM0399_Foxa2-ab_E14.5_liver__MM0393_H3K4me1_mm8/mm8/plus-minus-2kb"
   end

   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
### Mouse adult islets
################################################################################

# 17 Feb 2010 - 2517 filtered peaks, mm9
when "MM0555-MafA-1L-2517pks_islets__MM0386-H3K4me1-islets"

   tf_site_file = "Mafa_2517_ht3_NO_input_NO_tower_peak_max_liftedtomm9_peakmaxcoord.bed" # chr3	5431143	1

   reads_label = "MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm9/export/reads"
   end

   case host
   when "laptop" then out_dir = reads_dir
   when "xhost" then out_dir = "/home/grobertson/Profiles/MM0555_MafA_adult-islets/MM0555_MafA-islets__MM0386-H3K4me1-adult-islets/mm9/plus-minus-2kb"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


# 12 Feb 2010
when run = "MM0555-MafA-1L-islets__MM0386-H3K4me1-islets"

   tf_site_file = "MafA_MM0555-1Lane_mm8_peakMaxCoords.txt" # chr3	5431143	1

   reads_label = "MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads"
   end

   case host
   when "laptop" then out_dir = reads_dir
   when "xhost" then out_dir = "/home/grobertson/Profiles/MM0555_MafA_adult-islets/MM0555_MafA-islets__MM0386-H3K4me1-adult-islets/mm8/plus-minus-2kb"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


when "MM0325-Foxa2-islets__MM0386-H3K4me1-islets" # matched

   tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3	5431143	1

   reads_label = "MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads"
   end

   case host
   when "laptop" then out_dir = reads_dir
   when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0386-H3K4me1-adult-islets/mm8/plus-minus-10kb"
   #when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0386-H3K4me1-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0325-Foxa2-islets__MM0387-H3K4me3-islets" # matched

   tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3	5431143	1

   reads_label = "MM0387_H3K4me3_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0387-H3K4me3-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


### Pdx1 - 20 June 2009 ########################################################
when "MM0388-Pdx1-adult-islets__MM0386-H3K4me1-adult-islets"  # matched

   # Corrected FP2 subpeaks, 19june09
   #tf_site_dir = "/projects/03/genereg/projects/SOLEXA/Pdx1/MM0388/mm8/data"
   #tf_site_file = "Pdx1_MM0388_7L_mm8_xset200_dupsN_ht11.sub.peaks"  # FP2: 1	3	3000592	3001087	206	12	23
   #tf_site_file = "GalaxyHistoryItem-300-[Pdx_Islet_BED_peakmax.txt].bed.txt" # chr3	5431143	5431144  1
   tf_site_file = "Pdx1_MM0388_7L_mm8_xset200_dupsN_ht11.sub.peaks.peakmax_3col.txt"  # chr3	3000797	1, 29 June 2009

   reads_label = "MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then
         out_dir = "/home/grobertson/Profiles/MM0388_Pdx1_adult_islets/MM0388_Pdx1-islets__MM0386-H3K4me1-adult-islets/mm8/20090629"
         #out_dir = "/home/grobertson/Profiles/MM0388_Pdx1_adult_islets/MM0388_Pdx1-islets__MM0386-H3K4me1-adult-islets/mm8"
   end
   outfile = "#{run}.pm#{window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.pm#{2*window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


### 24 May 2009 ################################################################
when "MM0388-Pdx1-islets__MM0387-H3K4me3-islets"  # matched

   #tf_site_file = "GalaxyHistoryItem-300-[Pdx_Islet_BED_peakmax.txt].bed.txt" # chr3	5431143	5431144 1
   tf_site_file = "Pdx1_MM0388_7L_mm8_xset200_dupsN_ht11.sub.peaks.peakmax_3col.txt"  # chr3	3000797	1, 29 June 2009

   reads_label = "MM0387_H3K4me3_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0388_Pdx1_adult_islets/MM0388_Pdx1-islets__MM0387-H3K4me3-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0325-Foxa2-islets__MM0282-H3K4me1-liver"  # mismatched

   tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3	5431143	1

   reads_label = "MM0282_H3K4me1_adult_liver"
   reads_basefilename = "MM0282.um.eland.noDots.filtered."  # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0282-H3K4me1-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0325-Foxa2-islets__MM0281-H3K4me3-liver"  # mismatched

   tf_site_file = "Foxa2_Islet_peakmax_mm8.txt" # chr3	5431143	1

   reads_label = "MM0387_H3K4me3_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0325-Foxa2-islets__MM0281-H3K4me3-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "RefSeq__MM0386-H3K4me1-islets"

   refseq_file = "refFlat.txt"
   case host
      when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq"
      when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq"
   end

   reads_label = "MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0386_H3K4me1_adult_islets/export/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/RefSeq__MM0386-H3K4me1-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "RefSeq__MM0387-H3K4me3-islets"

   refseq_file = "refFlat.txt"
   case host
      when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq"
      when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq"
   end

   reads_label = "MM0387_H3K4me3_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0387_H3K4me3_adult_islets/maq/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost"  then out_dir = "/home/grobertson/Profiles/RefSeq__MM0387-H3K4me3-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
### Mouse adult liver
################################################################################
when "MM0261-Foxa2-liver__MM0282-H3K4me1-liver"   # matched

   tf_site_file = "Foxa2_Liver_peakmax_mm8.txt"

   reads_label = "MM0282_H3K4me1_adult_liver"
   reads_basefilename = "MM0282.um.eland.noDots.filtered."  # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0282-H3K4me1-adult-liver/mm8/plus-minus-10kb"
      #when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0282-H3K4me1-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0261-Foxa2-adult-liver__MM0393-E14.5-H3K4me1-liver"   # matched

   tf_site_file = "Foxa2_Liver_peakmax_mm8.txt"

   reads_label = "MM0393_H3K4me1_E14.5_liver"
   reads_basefilename = ""  # Reads filenames: 1.reads

   case host
      when "laptop" then
         raise
         #reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0393/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261_Foxa2_adult_liver/MM0261-Foxa2-adult-liver__MM0393-H3K4me1-E14.5-liver/mm8"
   end
   outfile = "#{run}.pm#{2*window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.pm#{2*window_half_width}bp.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0261-Foxa2-liver__MM0281-H3K4me3-liver"   # matched

   tf_site_file = "Foxa2_Liver_peakmax_mm8.txt"

   reads_label = "MM0281_H3K4me3_adult_liver"
   reads_basefilename = "MM0281.um.eland.noDots.filtered."  # Reads filenames: MM0281.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm8/eland/reads" # 04mai09
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0281-H3K4me3-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


### 24 May 2009 ################################################################
when "MM0351-Hnf4a-liver__MM0282-H3K4me1-liver"

   tf_site_file = "GalaxyHistoryItem-301-[Hnf4a_Liver_BED_peakmax.txt].bed.txt"

   reads_label = "MM0282_H3K4me1_adult_liver"
   reads_basefilename = "MM0282.um.eland.noDots.filtered."  # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0351_Hnf4a_adult_liver/MM0351-Hnf4a-liver__MM0282-H3K4me1-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


### 24 May 2009 ################################################################
when "MM0351-Hnf4a-liver__MM0281-H3K4me3-liver"

   tf_site_file = "GalaxyHistoryItem-301-[Hnf4a_Liver_BED_peakmax.txt].bed.txt"

   reads_label = "MM0281_H3K4me3_adult_liver"
   reads_basefilename = "MM0281.um.eland.noDots.filtered."  # Reads filenames: MM0281.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads"
      when "xhost"  then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm8/eland/reads" # 04mai09
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0351_Hnf4a_adult_liver/MM0351-Hnf4a-liver__MM0281-H3K4me3-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0261-Foxa2-liver__MM0386-H3K4me1-islets"  # mismatched

   tf_site_file = "Foxa2_Liver_peakmax_mm8.txt"

   reads_label = "MM0386_H3K4me1_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0386/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0386-H3K4me1-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "MM0261-Foxa2-liver__MM0387-H3K4me3-islets"  # mismatched

   tf_site_file = "Foxa2_Liver_peakmax_mm8.txt"

   reads_label = "MM0387_H3K4me3_adult_islets"
   reads_basefilename = ""  # Reads filenames:  1.reads
   case host
      when "laptop" then raise "not ready yet"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0387/mm8/export/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/MM0261-Foxa2-liver__MM0387-H3K4me3-adult-islets/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "RefSeq__MM0282-H3K4me1-liver"

   refseq_file = "refFlat.txt"
   case host
      when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq"
      when "xhost"  then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq"
   end

   reads_label = "MM0282_H3K4me1_adult_liver"
   reads_basefilename = "MM0282.um.eland.noDots.filtered."  # Reads filenames: MM0282.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0282_H3K4me1_adult_liver/eland/reads"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me1/MM0282/mm8/eland/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then out_dir = "/home/grobertson/Profiles/RefSeq__MM0282-H3K4me1-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "RefSeq__MM0281-H3K4me3-liver"

   refseq_file = "refFlat.txt"
   case host
      when "laptop" then ucsc_data_dir = "/Users/grobertson/GENEREG/Data/UCSC/mm8/RefSeq"
      when "xhost" then ucsc_data_dir = "/home/grobertson/DATA/UCSC/mm8/RefSeq"
   end

   reads_label = "MM0281_H3K4me3_adult_liver"
   reads_basefilename = "MM0281.um.eland.noDots.filtered."  # Reads filenames: MM0281.um.eland.noDots.filtered.10.reads
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads"
      when "xhost" then reads_dir = "/projects/03/genereg/projects/SOLEXA/H3K4me3/MM0281/mm8/eland/reads" # 04mai09
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then  out_dir = "/home/grobertson/Profiles/RefSeq__MM0281-H3K4me3-adult-liver/mm8"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "Stat1_HeLa__HS0260-4L_H3K4me1_unstim"

   tf_site_file = "Stat1_unstim_stim_shared_BED.txt"

   reads_basefilename = "HS0260.um.eland.noDots.filtered"
   case host
      when "laptop" then reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads"
      when "xhost" then  reads_dir = "/projects/03/genereg/projects/SOLEXA/HeLa/H3K4/HS0260_4lanes/eland/reads"
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then  out_dir = "/home/grobertson/Profiles/Stat1_HeLa/HS0260-4L_H3K4me1_unstim"
   end
   outfile = "#{run}.pm#{window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.pm#{window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


################################################################################
when "Stat1_HeLa__HS0262-4L_H3K4me1_stim"

   tf_site_file = "Stat1_unstim_stim_shared_BED.txt"

   reads_basefilename = "HS0262.um.eland.noDots.filtered."
   case host
      when "laptop" then raise
         reads_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/MORGEN/Mouse_histone_mods/H3K4/MM0281_H3K4me3_adult_liver/eland/reads"
      when "xhost" then  reads_dir = "/projects/03/genereg/projects/SOLEXA/HeLa/H3K4/HS0262_4lanes/eland/reads" # 04mai09
   end

   case host
      when "laptop" then out_dir = reads_dir
      when "xhost" then  out_dir = "/home/grobertson/Profiles/Stat1_HeLa/HS0262-4L_H3K4me1_stim"
   end
   outfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.txt"
   logfile = "#{run}.w#{2*window_half_width}.x#{xset_len}.#{target_type}.#{date}.LOG.txt"


end
################################################################################


if target_type == "tf" then
   puts "tf_site_dir: #{tf_site_dir}"
   puts "tf_site_file: #{tf_site_file}"
   tf_site_pathfile = File.join( tf_site_dir, tf_site_file )
end


# Choose the TF site (or peak-max) file
#tf_site_file = "Foxa2_Islet_peakmax.txt" # chr3	5431143	1
#tf_site_file = "Foxa2_Liver_peakmax.txt"
#===========================
puts "target_type: #{target_type}"
puts "test_data_or_real_data: #{test_data_or_real_data}"
puts "host: #{host}"
puts "reads_basefilename: #{reads_basefilename}"


case test_data_or_real_data
when "test_data" then
   outfile = "#{reads_basefilename}pm#{window_half_width}.x#{xset_len}.#{target_type}.test.txt"
   logfile = "#{reads_basefilename}pm#{window_half_width}.x#{xset_len}.#{target_type}.test.LOG.txt"
end


# Profiles
out_pathfile = File.join(out_dir,outfile)
begin
  profile_wh = File.open(out_pathfile,'w')
  puts "output file is: #{out_pathfile}"
rescue
  raise "could not open #{out_pathfile}"
end
puts "writing output to: #{out_pathfile}"

# Logfile
log_pathfile = File.join(out_dir,logfile)
begin
  log_wh = File.open(log_pathfile,'w')
  puts "LOG file is: #{log_pathfile}"
rescue
  raise "could not open #{log_pathfile}"
end

log_wh.puts "script_name = #{script_name}"
log_wh.puts "version = #{version}"
log_wh.puts "host = #{host}"
log_wh.puts "target_type = #{target_type}"
log_wh.puts "test_data_or_real_data = #{test_data_or_real_data}"
log_wh.puts "xset_len = #{xset_len}"
log_wh.puts "window_half_width = #{window_half_width} bp"
log_wh.puts "tf_site_dir = #{tf_site_dir}"
log_wh.puts "tf_site_file = #{tf_site_file}"
log_wh.puts "reads_dir = #{reads_dir}"
log_wh.puts "reads_basefilename = #{reads_basefilename}.*chr*.reads"
log_wh.puts "out_dir = #{out_dir}"
log_wh.puts "outfile = #{outfile}"


### FUNCTIONS ##################################################################

################################################################################
def decimal_places(x,n)
    y = ((x * (10**n)).to_i).to_f/(10**n)
    return y
end

def commify(number)
    # Does this work on a float?
    n_to_s = number.to_s
    c = n_to_s.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
    return c
end


################################################################################
=begin
CREATE TABLE `refFlat` (
  `geneName` varchar(255) NOT NULL default '',
  `name` varchar(255) NOT NULL default '',
  `chrom` varchar(255) NOT NULL default '',
  `strand` char(1) NOT NULL default '',
  `txStart` int(10) unsigned NOT NULL default '0',
  `txEnd` int(10) unsigned NOT NULL default '0',
  `cdsStart` int(10) unsigned NOT NULL default '0',
  `cdsEnd` int(10) unsigned NOT NULL default '0',
  `exonCount` int(10) unsigned NOT NULL default '0',
  `exonStarts` longblob NOT NULL,
  `exonEnds` longblob NOT NULL,
  KEY `name` (`name`(12)),
  KEY `chrom` (`chrom`(7),`txStart`),
  KEY `chrom_2` (`chrom`(7),`txEnd`),
  KEY `geneName` (`geneName`(10))
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
Grik1	NM_010348	chr16	-	87784753	88179117	87785052	88178561	16	87784753,87803012,87811935,87824756,87829396,87835403,87836652,87838886,87846348,87884773,87895207,87898563,87923013,87940193,87944866,88178443,	87785208,87803263,87812161,87824974,87829515,87835627,87836856,87839000,87846456,87884917,87895381,87898617,87923195,87940451,87945034,88179117,
=end
def get_refseq_data_by_chr( ucsc_data_dir, refseq_file )

   refseq_pathFile = File.join( ucsc_data_dir, refseq_file )
   begin
      rs_rh = File.open( refseq_pathFile )
   rescue
      raise "can't find #{refseq_pathFile}"
   end

   nbr_chr22 = 0
   transcripts_by_chr_HoAoA = Hash.new
   n = 0
   begin
      while line = rs_rh.readline.chomp

         if line =~ /^(\S+)\s+(\S+)\s+(chr\S+)\s+([+|-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/

         symbol =    $1
         id =        $2
         chr =       $3
         strand =    $4
         txStart =   $5.to_i
         txEnd =     $6.to_i
         cdsStart =  $7.to_i
         cdsEnd =    $8.to_i
         exonCount = $9.to_i

         rec = [txStart,txEnd,strand,id,symbol]

         if transcripts_by_chr_HoAoA.has_key?(chr) then transcripts_by_chr_HoAoA[chr] << rec
         else transcripts_by_chr_HoAoA[chr] = [rec]
         end

         n += 1; if (n % 10000 == 0) then puts " #{commify(n)}, #{chr}" end
         end
     end
   rescue EOFError
      rs_rh.close
   end

   puts " found #{commify(n)} RefSeq records"
  return transcripts_by_chr_HoAoA
end


################################################################################
=begin
uc001jgi.1	chr10	-	49034611	49152723	49035370	49152616	29	49034611,49041376,49046643,49049042,49050994,49052918,49053894,49056095,49058851,49062614,49062822,49063601,49065240,49070732,49079276,49084796,49090002,49100361,49101171,49110163,49114530,49116039,49117653,49118408,49120209,49122832,49127069,49129614,49152591,	49035419,49041720,49046742,49049257,49051142,49053004,49053988,49056200,49059057,49062731,49062930,49063694,49065341,49070943,49079433,49084982,49090158,49100500,49101319,49110338,49114602,49116172,49117741,49118541,49120401,49122898,49127227,49129740,49152723,	NP_001018081	uc001jgi.1
...
We load this into a Hash, gnf_gcRMA_H[probe_id] = expression_val, and find no
 duplicate ID values, so things are simple. We can get an expression value for a
 probe simply by calling a hash key.
=end
def get_knownGene_data_by_chr( ucsc_data_dir, knownGene_file, kgXref_HoA )

   knownGene_pathFile = File.join( ucsc_data_dir, knownGene_file )
   begin
      kg_rh = File.open( knownGene_pathFile )
   rescue
      raise "can't find #{knownGene_pathFile}"
   end

   knownGene_HoA = Hash.new
   n = 0
   begin
      while line = kg_rh.readline.chomp

         if line =~ /^(\S+)\s+(chr\S+)\s+([+|-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)/

         kg_id =     $1
         chr =       $2
         strand =    $3
         txStart =   $4.to_i
         txEnd =     $5.to_i
         cdsStart =  $6.to_i
         cdsEnd =    $7.to_i
         exonCount = $8.to_i
         kg_symbol = $11

         if kgXref_HoA.has_key?(kg_id) then
            gene_symbol = kgXref_HoA[kg_id]
         else
            puts " did not find a key in kgXref_HoA for #{kg_id}"
         end

         #symbol = "#{gene_symbol}__#{kg_symbol}"

         kg_rec = [txStart,txEnd,strand,kg_id,gene_symbol]
         # refseq: rec = [txStart,txEnd,strand,id,symbol]

         if knownGene_HoA.has_key?(chr) then knownGene_HoA[chr] << kg_rec
         else knownGene_HoA[chr] = [kg_rec]
         end

         n += 1; if (n % 10000 == 0) then puts " #{n}" end
         end
     end
   rescue EOFError
     kg_rh.close
   end

   puts " found #{commify(n)} knownGene records"
  return knownGene_HoA
end


################################################################################
=begin
kgXref.txt should give gene symbols
CREATE TABLE `kgXref` (
  `kgID` varchar(40) NOT NULL default '',
  `mRNA` varchar(40) default NULL,
  `spID` varchar(40) default NULL,
  `spDisplayID` varchar(40) default NULL,
  `geneSymbol` varchar(40) default NULL,
  `refseq` varchar(40) default NULL,
  `protAcc` varchar(40) default NULL,
  `description` longblob NOT NULL,
  KEY `kgID` (`kgID`),
  KEY `mRNA` (`mRNA`),
  KEY `spID` (`spID`),
  KEY `spDisplayID` (`spDisplayID`),
  KEY `geneSymbol` (`geneSymbol`),
  KEY `refseq` (`refseq`),
  KEY `protAcc` (`protAcc`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;


uc009vjg.1	BC048429			BC048429			Homo sapiens cDNA clone IMAGE:5275617, **** WARNING: chimeric clone ****.
kgID        mRNA           spID     spDisplayID geneSymbol refseq  protAcc          description
$1          $2             $3       $4          $5    $6             $7             $8
uc001aal.1	NM_001005484	Q8NH21	OR4F5_HUMAN	OR4F5	NM_001005484	NP_001005484	olfactory receptor, family 4, subfamily F,

=end

#def get_kgXref_data( ucsc_data_dir, knownGene_file )
#
#   kg_xref_pathFile = File.join( ucsc_data_dir, kgxref_file )
#   begin
#      kg_rh = File.open( kg_xref_pathFile )
#   rescue
#      raise "can't find #{kg_xref_pathFile}"
#   end
#
#   kg_xref_H = Hash.new  # [kg_id] = symbol
#   n = 0
#   begin
#      while line = kg_rh.readline.chomp
#
#         if line =~ /^(\S+)\t([^\t]+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)/
#
#         kg_id =         $1
#         mrna =          $2
#         sp_id =         $3
#         sp_display_id = $4
#         gene_symbol =   $5
#         refseq =        $6
#         prot_acc =      $7
#         description =   $8
#
#         kg_rec = [txStart,txEnd,strand,kg_id,kg_symbol]
#
#         if kg_xref_H.has_key?(kg_id) then
#            raise "found duplicate key for #{kg_id}"
#         else knownGene_HoA[kg_id] = gene_symbol
#         end
#
#         n += 1; if (n % 10000 == 0) then puts " #{n}" end
#         end
#     end
#   rescue EOFError
#     kg_rh.close
#   end
#
#   puts " found #{commify(n)} knownGene records"
#  return knownGene_HoA
#end


################################################################################
=begin
30 Sept 2008
uc001aaa.2	BC032353			BC032353			Homo sapiens cDNA FLJ36366 fis, clone THYMU2007824.
uc009vip.1	AX748260			AX748260			Homo sapiens cDNA FLJ36366 fis, clone THYMU2007824.
uc009vjg.1	BC048429			BC048429			Homo sapiens cDNA clone IMAGE:5275617, **** WARNING: chimeric clone ****.
uc001aal.1	NM_001005484	Q8NH21	OR4F5_HUMAN	OR4F5	NM_001005484	NP_001005484	olfactory receptor, family 4, subfamily F,
uc009vjh.1	AY972817	Q52R92	Q52R92_HUMAN	OR4F5	NM_001005484	NP_001005484	olfactory receptor, family 4, subfamily F,
uc001aaq.1	DQ599874			DQ599874			DQ599874
uc001aar.1	DQ599768			DQ599768			DQ599768
...
=end
def get_kgXref_data( ucsc_data_dir, kgXref_file )

   puts " get_kgXref_data( ucsc_data_dir, kgXref_file )"

   kgXref_pathFile = File.join( ucsc_data_dir, kgXref_file )
   begin
      kx_rh = File.open( kgXref_pathFile )
   rescue
      raise "can't find #{kgXref_pathFile}"
   end

   kgXref_HoA = Hash.new
   kgXref_symbol_to_kg_id_HoA = Hash.new  # HoA[symbol] = [kg,kg,kg,...]

   n = 0
   n_dups = 0
   begin
      while line = kx_rh.readline.chomp

         if line =~ /(\S+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)?\t([^\t]+)\t([^\t]+)?\t([^\t]+)?\t([^\t]+)/

            kg_id =       $1
            mRNA =        $2
            spID =        $3
            spDisplayID = $4
            gene_symbol = $5
            refseq =      $6
            protAcc =     $7
            description = $8

            gene_symbol.gsub(/ +/,"_")

            if kg_id == "uc010nxr.1" || kg_id == "uc010jhd.1" then
               puts " #{line}"
            end

            if kgXref_HoA.has_key?(kg_id) then
             puts " kgXref: found duplicate known gene ID: #{kg_id}"
              raise
              #kgXref_HoA[kg_id] << gene_symbol
              n_dups += 1
            else
              kgXref_HoA[kg_id] = gene_symbol
            end

            if kgXref_symbol_to_kg_id_HoA.has_key?(gene_symbol) then

               kgXref_symbol_to_kg_id_HoA[gene_symbol] << kg_id
               #puts " found another gene symbol for #{kg_id}"

            else kgXref_symbol_to_kg_id_HoA[gene_symbol] = [kg_id]
            end

            n += 1; if (n % 10000 == 0) then puts " #{n}" end

         else
            puts "did not match #{line}"
         end
      end

  rescue EOFError
    kx_rh.close
  end

  puts " found #{commify(n)} knownGene xRef records"
  puts " loaded kg Id's for #{commify(kgXref_symbol_to_kg_id_HoA.length)} gene symbols"
  puts " loaded #{kgXref_HoA.keys.length} 'kg_id' keys"
  puts " found #{n_dups} repeated symbols for a kg_id"

   ## Now ensure that symbols are unique
   #kg_id_keys_A = kgXref_HoA.keys
   #kg_id_keys_A.each do |k|
   #   list_of_symbols_A = kgXref_HoA[k]
   #   uniq_symbols_A = list_of_symbols_A.uniq
   #   kgXref_HoA[k] = uniq_symbols_A
   #end

  return kgXref_HoA, kgXref_symbol_to_kg_id_HoA
end


################################################################################
# reads_basefilename = "MM0281.um.eland.noDots.filtered" #.1.reads"
# chr1
# reads: 3000965	-	1
def get_reads_data_by_chr( reads_dir, reads_basefilename, chr, xset_len, test_data_or_real_data )

   puts "get_reads_data_by_chr( reads_dir, reads_basefilename, chr, xset_len, test_data_or_real_data )"

   puts " #{chr}"
   chr_nbr =        chr.gsub(/chr/,"")
   reads_file =     "#{reads_basefilename}#{chr_nbr}.reads"
   #reads_file =     "#{chr_nbr}.reads"
   reads_pathfile = File.join( reads_dir, reads_file )

   begin
      in_rh = File.open(reads_pathfile,'r')
   rescue
      raise "could not open #{reads_pathfile}"
   end

   xset_recs_on_this_chr_AoA = Array.new
   nbr_of_reads_read = 0

   begin
      while line = in_rh.readline.chomp

         if line =~ /(\d+)\s+([+|-])\s+(\d+)/ then

            raw_start = $1.to_i
            strand =    $2
            count =     $3.to_i

            if strand == "+"
               xset_start = raw_start
               xset_end =   raw_start + xset_len - 1
            elsif strand == "-"
               xset_end =   raw_start
               xset_start = raw_start - xset_len + 1
            else
               raise "L303: did not recognize strand: #{strand}"
            end

            xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ]
            nbr_of_reads_read += 1
            if (nbr_of_reads_read % 200000 == 0) then puts " #{commify(nbr_of_reads_read)}" end
         end
      end
   rescue EOFError
      in_rh.close
      puts " #{chr} had #{commify(nbr_of_reads_read)} reads"
   end

   return xset_recs_on_this_chr_AoA
end

#tf_site_pathfile


################################################################################
# NEW: 04 oct 09
# Read a GZIPPED file

def get_fp_bed_gz_data_by_chr( bed_gz_dir, bed_gz_basefilename, chr, xset_len, test_data_or_real_data )

   puts " #{chr}"
   #chr_nbr = chr.gsub(/chr/,"")
   # HS1238, HS1235
   bed_gz_file =     "#{chr}_#{bed_gz_basefilename}"
   bed_gz_pathfile = File.join( bed_gz_dir, bed_gz_file )
   puts " Opening #{bed_gz_pathfile}"

   if File.exists?(bed_gz_pathfile) then puts " Found #{bed_gz_pathfile}"
   else
      if File.exists?(bed_gz_dir) then puts " Found #{bed_gz_dir}"
      else puts " Could not find directory #{bed_gz_dir}"
      end
   end

   #begin
   #   in_rh = File.open(reads_pathfile,'r')
   #rescue
   #   raise "could not open #{reads_pathfile}"
   #end

   xset_recs_on_this_chr_AoA = Array.new
   nbr_of_reads_read = 0

   # ---------------------------------------------------------------------------
   begin

      File.open(bed_gz_pathfile) do |f|

         gzip = Zlib::GzipReader.new(f)

         data = gzip.read.split(/\n/)
         puts " data have #{commify(data.length)} records"

         n = 0
         data.each do |line|

            if line =~ /^track/
               puts " #{line}"
               next
            end

            # chr22	46246687	46246738	CTACAAGATTAGAAATTACAGTTTAGGGATCATGCAAGCCTCTGGCTCCa	0	+	46246687	46246738	255,0,0
            # $1     $2       $3       $4                                                 $5 $6
            if line =~ /^(chr\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+([+\-])/ then

               chr =    $1
               gstart = $2.to_i
               gend =   $3.to_i
               strand = $6

               if strand ==    "+" then
                  xset_start = gstart
                  xset_end =   gstart + xset_len - 1
               elsif strand == "-" then
                  xset_start = gend - xset_len + 1
                  xset_end = gend
               else
                  raise "L1003: did not recognize strand: #{strand}"
               end

               rec = [ xset_start, xset_end, strand ]
               xset_recs_on_this_chr_AoA << rec

               nbr_of_reads_read += 1
               if (nbr_of_reads_read % 50000 == 0) then puts "  #{commify(nbr_of_reads_read)}" end

            end # if line
         end  # do |line|
      end

   rescue

   end

   puts "  #{chr} had #{commify(nbr_of_reads_read)} reads"
   return xset_recs_on_this_chr_AoA
end


################################################################################
# chr3	8228272	8228273	6
# $1     $2       $3       $4
# 24may09 - This should not automatically handle both 3- and 4-field TF records

def get_tf_sites_by_chr( tf_site_pathfile )

   begin
      in_rh = File.open(tf_site_pathfile,'r')
   rescue
      raise "could not open #{tf_site_pathfile}"
   end

   tf_sites_by_chr_HoAoA = Hash.new
   nbr_of_tf_sites_read = 0

   begin
      while line = in_rh.readline.chomp


         ### Brad's 'new' format - 4 fields ####################################
         if line =~ /^(chr\S+)\s+(\d+)\s+(\d+)\s+(\d+)$/ then

            tgt_chr =   $1
            tgt_start = $2.to_i
            tgt_end =   $3.to_i
            tgt_id =    $4.to_i
            # There might be a 'strand' in the future

            tgt_rec_A = [ tgt_start, tgt_id ]

            if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then
               tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A
            else
               tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A]
            end

            nbr_of_tf_sites_read += 1
            if (nbr_of_tf_sites_read % 10000 == 0) then puts " #{commify(nbr_of_tf_sites_read)}" end
            #if tgt_score > 200 then puts " [#{nbr_of_tf_sites_read}] #{tgt_score}" end


         ### Brad's 'old' format - 3 fields
         elsif line =~ /^(chr\S+)\s+(\d+)\s+(\d+)$/ then

            tgt_chr =   $1
            tgt_start = $2.to_i
            tgt_id =    $3.to_i
            # There might be a 'strand' in the future

            tgt_rec_A = [ tgt_start, tgt_id ]

            if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then
               tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A
            else
               tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A]
            end

            nbr_of_tf_sites_read += 1
            if (nbr_of_tf_sites_read % 10000 == 0) then puts " #{commify(nbr_of_tf_sites_read)}" end
            #if tgt_score > 200 then puts " [#{nbr_of_tf_sites_read}] #{tgt_score}" end


         #######################################################################
         ### Standard FP2 format with subpeak ID: 1	3	3000592	3001087	206	12	23
         # Corrected FP2 subpeaks, 19june09
         elsif line =~ /^(\d+)\s(\S+)\t(\d+)\t(\d+)\t(\d+)\s(\d+)\s(\d+)$/
            #             id     chr    start  end  maxOffset ht   reads
            #             $1     $2     $3     $4     $5     $6     $7
            tgt_id =         $1.to_i
            tgt_chrom =      $2
            tgt_start =      $3.to_i
            tgt_end =        $4.to_i
            tgt_max_offset = $5.to_i

            tgt_chr = "chr#{tgt_chrom}"
            tgt_coord = tgt_start + tgt_max_offset - 1
            tgt_rec_A = [ tgt_coord, tgt_id ]

            if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then
               tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A
            else
               tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A]
            end

            nbr_of_tf_sites_read += 1
            if (nbr_of_tf_sites_read % 10000 == 0) then puts " #{commify(nbr_of_tf_sites_read)} FP2 records" end
            #if tgt_score > 200 then puts " [#{nbr_of_tf_sites_read}] #{tgt_score}" end


         #######################################################################
         # 26 Sept 2010
         # FindPeaks differential enrichment coordinate call
         elsif line =~ /(chr\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)/
         #               chr1	      1667520	1434	1581	   13.3042	1.0    0.0
         #               $1         $2       $3    $4       $5       $6     $7
            tgt_chr =   $1
            tgt_coord = $2.to_i
            tgt_id =    $1 + "_" + $2
            tgt_pval =  $7.to_f

            tgt_rec_A = [ tgt_coord, tgt_id ]

            if tf_sites_by_chr_HoAoA.has_key?(tgt_chr) then
               tf_sites_by_chr_HoAoA[tgt_chr] << tgt_rec_A
            else
               tf_sites_by_chr_HoAoA[tgt_chr] = [tgt_rec_A]
            end

            nbr_of_tf_sites_read += 1
            if (nbr_of_tf_sites_read % 1000 == 0) then puts " #{commify(nbr_of_tf_sites_read)} DE records" end


         else  #################################################################
            puts line
            raise "L1414: did not recognize TF file format"

         end
      end
   rescue EOFError
      in_rh.close
      puts " read #{commify(nbr_of_tf_sites_read)} TF site recs"
   end

   return tf_sites_by_chr_HoAoA
end


################################################################################
# chr3	8228272	8228273	6
# $1     $2       $3       $4
# 24may09 - This should not automatically handle both 3- and 4-field TF records

def get_de_sites_by_chr( de_coords_pathfile )

   begin
      in_rh = File.open(de_coords_pathfile,'r')
   rescue
      raise "could not open #{de_coords_pathfile}"
   end

   de_coords_by_chr_HoAoA = Hash.new
   nbr_of_de_coords_read = 0

   begin
      while line = in_rh.readline.chomp

         #######################################################################
         # 26 Sept 2010
         # FindPeaks differential enrichment coordinate call
         if line =~ /(chr\S+)\s+(\d+)\s+(\d+)\s+(-?\d+)\s+(\S+)\s+(\S+)\s+(\S+)/
         #               chr1	      1667520	1434	1581	   13.3042	1.0    0.0
         #               $1         $2       $3    $4       $5       $6     $7
            tgt_chr =   $1
            tgt_coord = $2.to_i
            tgt_id =    $1 + "_" + $2
            tgt_pval =  $7.to_f

            tgt_rec_A = [ tgt_coord, tgt_id ]

            if de_coords_by_chr_HoAoA.has_key?(tgt_chr) then
               de_coords_by_chr_HoAoA[tgt_chr] << tgt_rec_A
            else
               de_coords_by_chr_HoAoA[tgt_chr] = [tgt_rec_A]
            end

            nbr_of_de_coords_read += 1
            if (nbr_of_de_coords_read % 1000 == 0) then puts " #{commify(nbr_of_de_coords_read)} DE records" end


         else  #################################################################
            puts line
            raise "L1537: did not recognize DE file format"

         end
      end
   rescue EOFError
      in_rh.close
      puts " read #{commify(nbr_of_de_coords_read)} DE-coord recs"
   end

   return de_coords_by_chr_HoAoA
end


################################################################################
def generate_nonTSS_profile_for_overlapped_xsets(chr, run,
                              nbr_tgt_recs_processed_on_this_chr,
                              tgt_coord, xset_len, window_half_width,
                              xset_recs_on_this_chr_AoA,
                              nbr_target_recs_on_this_chr)

   # Generate a window around this site, using absolute coords
   rgn_start = tgt_coord - window_half_width
   rgn_end =   tgt_coord + window_half_width - 1
   rgn_len =   rgn_end - rgn_start + 1

   # Probably smart to check overlap against a rgn that has been *extended* by XSET
   # length on either end, so that we only need to check for fully-contained overlap,
   # and can ignore checking for partial overlap.
   # Then we can just take a slice out of the extended profile that corresponds
   # to the central region.
   extended_rgn_start = rgn_start - xset_len + 1
   extended_rgn_end =   rgn_end + xset_len - 1
   extended_rgn_len =   extended_rgn_end - extended_rgn_start
   extended_profile_A = Array.new( rgn_len, 0 )  # Initialize array values to 0

   # Use a hash that has absolute genomic coordinates as keys (to facilitate updating a profile)
   extended_profile_H = Hash.new
   (extended_rgn_start..extended_rgn_end).each { |abs_coord| extended_profile_H[abs_coord] = 0 }

   # xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ]
   #                                0           1         2       3

   sites_that_overlap_current_rgn_AoA = Array.new

   xsets_that_overlap_extended_rgn_AoA = xset_recs_on_this_chr_AoA.find_all { |xset_rec_A|
      (extended_rgn_start <= xset_rec_A[0]) && (extended_rgn_end >= xset_rec_A[1])
   }

   nbr_of_overlapping_xsets = xsets_that_overlap_extended_rgn_AoA.length
   puts " #{run}: #{chr} (#{nbr_tgt_recs_processed_on_this_chr}/#{nbr_target_recs_on_this_chr}): #{nbr_of_overlapping_xsets} overlapping xsets"

   if nbr_of_overlapping_xsets > 0 then

      #sorted_xsets_that_overlap_extended_rgn_AoA = xsets_that_overlap_extended_rgn_AoA.sort_by { |xset_rec| [xset_recs_on_this_chr_AoA[0]] }

      xsets_that_overlap_extended_rgn_AoA.each do |xset_rec_A|
         xset_abs_start = xset_rec_A[0]
         xset_abs_end =   xset_rec_A[1]
         (xset_abs_start..xset_abs_end).each { |coord| extended_profile_H[coord] += 1 }
      end

   else
      #puts " #{current_site_motif_name} (#{current_site_motif_id}): overlaps no other site"
   end

   # Now extract the profile height array
   extracted_profile_hts_A = Array.new
   (rgn_start..rgn_end).each { |abs_coord| extracted_profile_hts_A << extended_profile_H[abs_coord] }

   return extracted_profile_hts_A
end

################################################################################
# 17 Oct. 09, needed for long regions. Not implemented yet.
def check_whether_region_overlaps_a_nearby_gene()

end

################################################################################
# This does strand-corrected profiles around TSS's.
# Last modified 10 May 2009.

def generate_TSS_profile_for_overlapped_xsets(chr, run,
                              nbr_tgt_recs_processed_on_this_chr,
                              tgt_coord, tgt_strand,
                              xset_len, window_half_width,
                              xset_recs_on_this_chr_AoA,
                              nbr_target_recs_on_this_chr)

   # Generate a window around this site, using absolute coords
   rgn_start = tgt_coord - window_half_width
   rgn_end =   tgt_coord + window_half_width - 1
   rgn_len =   rgn_end - rgn_start + 1

   # Probably smart to check overlap against a rgn that has been *extended* by XSET
   # length on either end, so that we only need to check for fully-contained overlap,
   # and can ignore checking for partial overlap.
   # Then we can just take a slice out of the extended profile that corresponds
   # to the central region.
   extended_rgn_start = rgn_start - xset_len + 1
   extended_rgn_end =   rgn_end + xset_len - 1
   extended_rgn_len =   extended_rgn_end - extended_rgn_start


   extended_profile_A = Array.new( rgn_len, 0 )  # Initialize array values to 0

   # Use a hash that has absolute genomic coordinates as keys (to facilitate updating a profile)
   extended_profile_H = Hash.new
   (extended_rgn_start..extended_rgn_end).each { |abs_coord| extended_profile_H[abs_coord] = 0 }

   # xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ]
   #                                0           1         2       3

   sites_that_overlap_current_rgn_AoA = Array.new

   xsets_that_overlap_extended_rgn_AoA = xset_recs_on_this_chr_AoA.find_all { |xset_rec_A|
      (extended_rgn_start <= xset_rec_A[0]) && (extended_rgn_end >= xset_rec_A[1])
   }

   nbr_of_overlapping_xsets = xsets_that_overlap_extended_rgn_AoA.length
   puts " #{run}: #{chr} (#{nbr_tgt_recs_processed_on_this_chr}/#{nbr_target_recs_on_this_chr}): #{nbr_of_overlapping_xsets} overlapping xsets"

   if nbr_of_overlapping_xsets > 0 then

      #sorted_xsets_that_overlap_extended_rgn_AoA = xsets_that_overlap_extended_rgn_AoA.sort_by { |xset_rec| [xset_recs_on_this_chr_AoA[0]] }

      xsets_that_overlap_extended_rgn_AoA.each do |xset_rec_A|
         xset_abs_start = xset_rec_A[0]
         xset_abs_end =   xset_rec_A[1]
         (xset_abs_start..xset_abs_end).each { |coord| extended_profile_H[coord] += 1 }
      end

   else
      #puts " #{current_site_motif_name} (#{current_site_motif_id}): overlaps no other site"
   end

   # Now extract the profile height array
   extracted_profile_hts_A = Array.new
   (rgn_start..rgn_end).each { |abs_coord| extracted_profile_hts_A << extended_profile_H[abs_coord] }

   #strand_corrected_profile_hts_A = Array.new
   if tgt_strand == "+" then strand_corrected_profile_hts_A = extracted_profile_hts_A
   elsif tgt_strand == "-" then strand_corrected_profile_hts_A = extracted_profile_hts_A.reverse
   else raise "L911: did not recognize strand #{tgt_strand}"
   end

   return strand_corrected_profile_hts_A
end


################################################################################
# This is for strand-corrected =asymmetric= profiles around TSS's.
# This is new and is not working yet.
# It should return stranded i.e. 5'-to-3'-oriented profiles.
# We should also check whether this region overlaps the end of any other gene/transcript. (and respond when it does)
# Last modified 17 October 2009.

def generate_asymmetric_TSS_profile_for_overlapped_xsets(
      chr, run,
      nbr_tgt_recs_processed_on_this_chr,
      tgt_coord, tgt_strand, tgt_id, tgt_symbol,
      xset_len,
      # window_half_width,
      window_5pr_width, window_3pr_width,
      xset_recs_on_this_chr_AoA,
      nbr_target_recs_on_this_chr,
      transcript_recs_on_this_chr_AoA
      )

   # Generate a window around this site, using absolute coords
   # Probably smart to check overlap against a rgn that has been *extended* by XSET
   # length on either end, so that we only need to check for fully-contained overlap,
   # and can ignore checking for partial overlap.
   # Then we can just take a slice out of the extended profile that corresponds
   # to the central region.


   # What we should do:
   # 1. Generate a 'naive' sequence region with a naiveStart (left end) and naiveEnd (right end).
   #    'Naive' start and end are as they'd appear at the UCSC genome browser, and is the convention used at UCSC for gene coordinates.
   #    5'-|---------------------->-----|-3' (+)
   #                     3'-|-----<----------------------|-5' (-)
   # 2. Calculate the profile for the naive region, regardless of the target strand.
   # 3. If it's a (-)-strand target, reverse the region list just before returning it. This makes it 5'-to-3' oriented.
   #    Reversed (-)-strand profile that is returned
   #    5'-|---------------------->-----|-3' (-/reversed)

   case tgt_strand

      when "+"

         naive_rgn_start = tgt_coord - window_5pr_width + 1
         naive_rgn_end =   tgt_coord + window_3pr_width
         rgn_len =         naive_rgn_end - naive_rgn_start + 1

         extended_rgn_start = naive_rgn_start - xset_len
         extended_rgn_end =   naive_rgn_end + xset_len
         extended_rgn_len =   extended_rgn_end - extended_rgn_start

      when "-"

         naive_rgn_end =   tgt_coord + window_5pr_width
         naive_rgn_start = tgt_coord - window_3pr_width + 1
         rgn_len =         naive_rgn_end - naive_rgn_start + 1

         extended_rgn_end =   naive_rgn_end + xset_len
         extended_rgn_start = naive_rgn_start - xset_len
         extended_rgn_len =   extended_rgn_start - extended_rgn_end

   end

   # Overlap has many cases!!
   # RefSeq transcripts: rec = [txStart,txEnd,strand,id,symbol]
   transcripts_that_overlap_extended_rgn_AoA = transcript_recs_on_this_chr_AoA.find_all { |tscpt_rec_A|

      (extended_rgn_start <= tscpt_rec_A[0]) && (extended_rgn_end >= tscpt_rec_A[1])

   }

   #--- OLD code ---------------------------------------------------------------

   extended_profile_A = Array.new( rgn_len, 0 )  # Initialize array values to 0

   # Use a hash that has absolute genomic coordinates as keys (to facilitate updating a profile)
   extended_profile_H = Hash.new
   (extended_rgn_start..extended_rgn_end).each { |abs_coord| extended_profile_H[abs_coord] = 0 }

   # xset_recs_on_this_chr_AoA << [ xset_start, xset_end, strand, count ]
   #                                0           1         2       3

   sites_that_overlap_current_rgn_AoA = Array.new

   xsets_that_overlap_extended_rgn_AoA = xset_recs_on_this_chr_AoA.find_all { |xset_rec_A|
      (extended_rgn_start <= xset_rec_A[0]) && (extended_rgn_end >= xset_rec_A[1])
   }

   nbr_of_overlapping_xsets = xsets_that_overlap_extended_rgn_AoA.length
   puts " #{run}: #{chr} (#{nbr_tgt_recs_processed_on_this_chr}/#{nbr_target_recs_on_this_chr}) #{tgt_id} #{tgt_symbol} (#{tgt_strand}): #{nbr_of_overlapping_xsets} overlapping xsets"

   if nbr_of_overlapping_xsets > 0 then

      #sorted_xsets_that_overlap_extended_rgn_AoA = xsets_that_overlap_extended_rgn_AoA.sort_by { |xset_rec| [xset_recs_on_this_chr_AoA[0]] }

      xsets_that_overlap_extended_rgn_AoA.each do |xset_rec_A|
         xset_abs_start = xset_rec_A[0]
         xset_abs_end =   xset_rec_A[1]
         (xset_abs_start..xset_abs_end).each { |coord| extended_profile_H[coord] += 1 }
      end

   else
      #puts " #{current_site_motif_name} (#{current_site_motif_id}): overlaps no other site"
   end

   # Now extract the profile height array
   extracted_profile_hts_A = Array.new

   (naive_rgn_start..naive_rgn_end).each { |abs_coord| extracted_profile_hts_A << extended_profile_H[abs_coord] }

   #strand_corrected_profile_hts_A = Array.new
   if tgt_strand == "+" then
      strand_corrected_profile_hts_A = extracted_profile_hts_A
   elsif tgt_strand == "-" then
      strand_corrected_profile_hts_A = extracted_profile_hts_A.reverse
   else raise "L911: did not recognize strand #{tgt_strand}"
   end

   return strand_corrected_profile_hts_A

   #--- OLD code ---------------------------------------------------------------

end


################################################################################
def get_hg18_chr_list()

  chrs_A = [
      "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX"
      #"chrM",
      #"chrX" #,
      #"chrY"
      ]

  chrom_lens_H = {
      "chr1" => 247249719,
      "chr2" => 242951149,
      "chr3" => 199501827,
      "chr4" => 191273063,
      "chr5" => 180857866,
      "chr6" => 170899992,
      "chr7" => 158821424,
      "chr8" => 146274826,
      "chr9" => 140273252,
      "chr10" => 135374737,
      "chr11" => 134452384,
      "chr12" => 132349534,
      "chr13" => 114142980,
      "chr14" => 106368585,
      "chr15" => 100338915,
      "chr16" => 88827254,
      "chr17" => 78774742,
      "chr18" => 76117153,
      "chr19" => 63811651,
      "chr20" => 62435964,
      "chr21" => 46944323,
      "chr22" => 49691432,
      "chrX" => 154913754 #,
      #"chrM" => 16571,
      #"chrY" => 57772954
  }

   puts "...hg18 chromInfo OK"
  return chrs_A,chrom_lens_H
end

def get_mm9_chr_list()

  chrs_A = [
      "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chrX"
      #"chrM",
      #"chrY"
      ]

   chrom_lens_H = {
      "chr1" => 197195432,
      "chr2" => 181748087,
      "chr3" => 159599783,
      "chr4" => 155630120,
      "chr5" => 152537259,
      "chr6" => 149517037,
      "chr7" => 152524553,
      "chr8" => 131738871,
      "chr9" => 124076172,
      "chr10" => 129993255,
      "chr11" => 121843856,
      "chr12" => 121257530,
      "chr13" => 120284312,
      "chr14" => 125194864,
      "chr15" => 103494974,
      "chr16" => 98319150,
      "chr17" => 95272651,
      "chr18" => 90772031,
      "chr19" => 61342430,
      "chrX" => 166650296,
      #["chrY" => 15902555],
      #["chrM" => 16299],
   }

   return chrs_A,chrom_lens_H
end

def get_mm8_chr_list()

  chrs_A = [
      "chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chrX"
      #"chrM",
      #"chrY"
      ]

   chrom_lens_H = {
      "chr1" => 197069962,
      "chr2" => 181976762,
      "chr3" => 159872112,
      "chr4" => 155029701,
      "chr5" => 152003063,
      "chr6" => 149525685,
      "chr7" => 145134094,
      "chr8" => 132085098,
      "chr9" => 124000669,
      "chr10" => 129959148,
      "chr11" => 121798632,
      "chr12" => 120463159,
      "chr13" => 120614378,
      "chr14" => 123978870,
      "chr15" => 103492577,
      "chr16" => 98252459,
      "chr17" => 95177420,
      "chr18" => 90736837,
      "chrX" => 165556469,
      "chrY" => 16029404,
      "chrM" => 16299
   }

   return chrs_A,chrom_lens_H
end


################################################################################
### MAIN #######################################################################

puts "1. Get chromInfo"
#-------------------------------------------------------------------------------
case assembly
when "mm8"
   puts "getting mm8 chromInfo"
   chrs_A,chrom_lens_H = get_mm8_chr_list()
when "mm9"
   puts "getting mm9 chromInfo"
   chrs_A,chrom_lens_H = get_mm9_chr_list()
when "hg18"
   puts "getting hg18 chromInfo"
   chrs_A,chrom_lens_H = get_hg18_chr_list()
end


#-------------------------------------------------------------------------------
#puts "\n1. get_gnf_gcRMA_CD4pTcell_expression_data()"
#gnf_gcRMA_H = get_gnf_gcRMA_CD4pTcell_expression_data( su_data_dir, gnf_gcRMA_CD4pTcell_file )
# gnf_gcRMA_H[probe_id] = expression_val

case target_type
when "tss"
   #----------------------------------------------------------------------------
   #puts "\n2. get_knownGene_data()"
   #knownGene_HoA = get_knownGene_data_by_chr( ucsc_data_dir, knownGene_file )
   # kg_rec = [txStart,txEnd,strand,kg_id,kg_symbol]
   # kgXref_symbol_to_kg_id_HoA[gene_symbol] = [kg_id,kg_id,kg_id,...]
   #----------------------------------------------------------------------------
   #puts "\n3. get_kgXref_data()"
   #kgXref_HoA, kgXref_symbol_to_kg_id_HoA = get_kgXref_data( ucsc_data_dir, kgXref_file )
   # kgXref_H[kg_id] = geneSymbol

   puts "\n2. Get RefSeq or knownGene transcripts"
   case gene_type

      when "refFlat.txt"
      puts "refFlat.txt"
      #----------------------------------------------------------------------------
      transcripts_by_chr_HoAoA = get_refseq_data_by_chr( ucsc_data_dir, transcript_file )
      #----------------------------------------------------------------------------
      # rec = [txStart,txEnd,strand,id,symbol]

      when "knownGene.txt"
      puts "knownGene.txt"
      #----------------------------------------------------------------------------
      kgXref_HoA, kgXref_symbol_to_kg_id_HoA = get_kgXref_data( ucsc_data_dir, kgXref_file )
      # return kgXref_HoA, kgXref_symbol_to_kg_id_HoA

      transcripts_by_chr_HoAoA = get_knownGene_data_by_chr( ucsc_data_dir, transcript_file, kgXref_HoA )
      #----------------------------------------------------------------------------
      # kg_rec = [txStart,txEnd,strand,kg_id,kg_symbol]

   end


when "tf"

   puts "\n2. get TF site data"
   #----------------------------------------------------------------------------
   tf_sites_by_chr_HoAoA = get_tf_sites_by_chr( tf_site_pathfile )
   #----------------------------------------------------------------------------
   # tgt_rec_A = [ tgt_start, tgt_id ]


when "de_coords"

   puts "\n2. Get DE coord data"
   #----------------------------------------------------------------------------
   de_coords_by_chr_HoAoA = get_de_sites_by_chr( de_coords_pathfile )
   #----------------------------------------------------------------------------
   # tgt_rec_A = [ tgt_start, tgt_id ]


end


puts "\n4. get reads data, per chromosome"

#reads_chrs_A.each do |chr|
chrs_A.each do |chr|

   case data_type
   when "reads"
   # For either genes/TSS's or TF sites as targets
   #----------------------------------------------------------------------------
   # Get reads for this chr reads_basefilename = "MM0281.um.eland.noDots.filtered" #.1.reads"
   xset_recs_on_this_chr_AoA = get_reads_data_by_chr( reads_dir,
                                                     reads_basefilename,
                                                     chr,
                                                     xset_len,
                                                     test_data_or_real_data
                                                     )
   #----------------------------------------------------------------------------

   when "fp_bed_gz"
   #----------------------------------------------------------------------------
   # chr1_HS1238_H4ac_hg18_maq.bed.gz
   # gunzip
   # Then:
   xset_recs_on_this_chr_AoA = get_fp_bed_gz_data_by_chr( bed_gz_dir,
                                                      bed_gz_basefilename,
                                                      chr,
                                                      xset_len,
                                                      test_data_or_real_data
                                                      )
   #gzip
   #----------------------------------------------------------------------------
   end


   # Now choose transcript TSSs or TF sites or DE-coords #######################
   case target_type

   #############################################################################
   when "tss"

      # Get all target locations and strands on this chr
      transcript_recs_on_this_chr_AoA = transcripts_by_chr_HoAoA[chr]
      nbr_target_recs_on_this_chr = transcript_recs_on_this_chr_AoA.length
      puts "\n #{reads_label}: #{chr} has #{nbr_target_recs_on_this_chr} records"

      # Now process each transcript record, one at a time
      nbr_tgt_recs_processed_on_this_chr = 0
      transcript_recs_on_this_chr_AoA.each do |tgt_rec|

         nbr_tgt_recs_processed_on_this_chr += 1

         # rec = [txStart,txEnd,strand,id,symbol]  RefSeq, 10 May 2009
         #        0       1     2      3  4

         tgt_id =     tgt_rec[3]
         tgt_symbol = tgt_rec[4]
         tgt_strand = tgt_rec[2]

         if    tgt_strand == "+" then tgt_coord = tgt_rec[0]
         elsif tgt_strand == '-' then tgt_coord = tgt_rec[1]
         else  raise "did not recognize strand #{strand}"
         end

         # 10 May 2009 - this 'if' lets us debug results
         # Comment it out for an actual run.
         tgt_ids_A = %w[ NM_009460 NM_018868 NM_007561 NM_001045513 ]

         #if tgt_ids_A.include?(tgt_id) then
            #puts "\ngenerating a profile for #{tgt_id}"
            strand_corrected_profile_hts_A = Array.new
            #----------------------------------------------------------------------

            case window
            when "symmetric"
               #strand_corrected_profile_hts_A = generate_TSS_profile_for_overlapped_xsets(
               #                        chr, run,
               #                        nbr_tgt_recs_processed_on_this_chr,
               #                        tgt_coord, tgt_strand,
               #                        xset_len, window_half_width,
               #                        xset_recs_on_this_chr_AoA,
               #                        nbr_target_recs_on_this_chr
               #                        )
            when "asymmetric"
               strand_corrected_profile_hts_A = generate_asymmetric_TSS_profile_for_overlapped_xsets(
                                       chr, run,
                                       nbr_tgt_recs_processed_on_this_chr,
                                       tgt_coord, tgt_strand, tgt_id, tgt_symbol,
                                       xset_len,
                                       # window_half_width,
                                       window_5pr_width, window_3pr_width,
                                       xset_recs_on_this_chr_AoA,
                                       nbr_target_recs_on_this_chr,
                                       transcript_recs_on_this_chr_AoA
                                       )

            end
            #----------------------------------------------------------------------

            area_under_the_profile = 0
            strand_corrected_profile_hts_A.each { |ht| area_under_the_profile += ht }

            center_of_strand_corrected_profile_hts_A = strand_corrected_profile_hts_A[999..2999]
            area_under_the_centre_of_the_profile = 0
            center_of_strand_corrected_profile_hts_A.each { |ht| area_under_the_centre_of_the_profile += ht }


            case test_data_or_real_data
            when "real_data"
               profile_wh.puts [ tgt_id, tgt_symbol, chr, tgt_coord, tgt_strand, area_under_the_profile, area_under_the_centre_of_the_profile, strand_corrected_profile_hts_A.join("\t") ].join("\t")
            when "test_data"
               profile_wh.puts strand_corrected_profile_hts_A.join("\n")
            end
         #end # End of debugging loop

      end


   #############################################################################
   when "tf"

      # Get all target locations and strands on this chr
      target_recs_on_this_chr_AoA = tf_sites_by_chr_HoAoA[chr]
      nbr_target_recs_on_this_chr = target_recs_on_this_chr_AoA.length
      puts "\n #{reads_label}: #{chr} has #{nbr_target_recs_on_this_chr} TF sites"

      nbr_tgt_recs_processed_on_this_chr = 0


      target_recs_on_this_chr_AoA.each do |tgt_rec_A|  #########################

         nbr_tgt_recs_processed_on_this_chr += 1

         # tgt_rec_A << [ tgt_start, tgt_id ]
         #                0          1

         tgt_start = tgt_rec_A[0].to_i
         tgt_id =    tgt_rec_A[1].to_i

         tgt_strand = "+"  # Currently fixed to '+' strand

         if tgt_strand == "+" then
            tgt_coord = tgt_rec_A[0]
         elsif tgt_strand == '-'
            raise "have only '-' strand TF sites for now"
         else
            raise "did not recognize strand #{tgt_strand}"
         end

         #----------------------------------------------------------------------
         case window
         when "symmetric"
            # This is what is already working
            profile_heights_A = generate_nonTSS_profile_for_overlapped_xsets(
                                    chr, run,
                                    nbr_tgt_recs_processed_on_this_chr,
                                    tgt_coord, xset_len, window_half_width,
                                    xset_recs_on_this_chr_AoA,
                                    nbr_target_recs_on_this_chr
                                    )
         when "asymmetric"
            # This is new and is not working yet. It should return strand-oriented (5'-to-3') profiles.
            profile_heights_A = generate_asymmetric_TSS_profile_for_overlapped_xsets(
                              chr, run,
                              nbr_tgt_recs_processed_on_this_chr,
                              tgt_coord, tgt_strand,
                              xset_len,
                              # window_half_width,
                              window_5pr_width, window_3pr_width,
                              xset_recs_on_this_chr_AoA,
                              nbr_target_recs_on_this_chr)
         end
         #----------------------------------------------------------------------

         area_under_the_profile = 0
         profile_heights_A.each { |ht| area_under_the_profile += ht }

         case test_data_or_real_data
         when "real_data"
            profile_wh.puts [ tgt_id, ".", chr, tgt_coord, "+", area_under_the_profile, profile_heights_A.join("\t") ].join("\t")
         when "test_data"
            profile_wh.puts profile_heights_A.join("\n")
         end
      end


   #############################################################################
   when "de_coords"

      # Get all target locations and strands on this chr
      # get_de_sites_by_chr( de_coords_pathfile )
      target_recs_on_this_chr_AoA = de_coords_by_chr_HoAoA[chr]
      nbr_target_recs_on_this_chr = target_recs_on_this_chr_AoA.length
      puts "\n #{reads_label}: #{chr} has #{nbr_target_recs_on_this_chr} DE coords"

      nbr_tgt_recs_processed_on_this_chr = 0


      target_recs_on_this_chr_AoA.each do |tgt_rec_A|  #########################

         nbr_tgt_recs_processed_on_this_chr += 1

         # tgt_rec_A = [ tgt_coord, tgt_id ]
         #               0          1

         tgt_coord = tgt_rec_A[0].to_i
         tgt_id =    tgt_rec_A[1]

         #tgt_strand = "+"  # Currently fixed to '+' strand
         #
         #if tgt_strand == "+" then
         #   tgt_coord = tgt_rec_A[0]
         #elsif tgt_strand == '-'
         #   raise "have only '-' strand DE sites for now"
         #else
         #   raise "did not recognize strand #{tgt_strand}"
         #end

         #----------------------------------------------------------------------
         case window
         when "symmetric"
            # This is what is already working
            profile_heights_A = generate_nonTSS_profile_for_overlapped_xsets(
                                    chr, run,
                                    nbr_tgt_recs_processed_on_this_chr,
                                    tgt_coord, xset_len, window_half_width,
                                    xset_recs_on_this_chr_AoA,
                                    nbr_target_recs_on_this_chr
                                    )
         when "asymmetric"
            # This is new and is not working yet. It should return strand-oriented (5'-to-3') profiles.
            profile_heights_A = generate_asymmetric_TSS_profile_for_overlapped_xsets(
                              chr, run,
                              nbr_tgt_recs_processed_on_this_chr,
                              tgt_coord, tgt_strand,
                              xset_len,
                              # window_half_width,
                              window_5pr_width, window_3pr_width,
                              xset_recs_on_this_chr_AoA,
                              nbr_target_recs_on_this_chr)
         end
         #----------------------------------------------------------------------

         area_under_the_profile = 0
         profile_heights_A.each { |ht| area_under_the_profile += ht }

         case test_data_or_real_data
         when "real_data"
            profile_wh.puts [ tgt_id, ".", chr, tgt_coord, "+", area_under_the_profile, profile_heights_A.join("\t") ].join("\t")
         when "test_data"
            profile_wh.puts profile_heights_A.join("\n")
         end
      end


   end

end
profile_wh.close
log_wh.close

puts "\ndone! See: #{out_pathfile}"