#!/usr/bin/env ruby =begin compile_dna-methylation_matrix.rb 06 January 2011, 22h00 INTENT: compile a set of DNA methylation files into a matrix of gene/beta vs patient, for AML patients 2. Check a list of samples to include. Hybridization REF TCGA-AB-2802-03A-01D-0741-05 TCGA-AB-2802-03A-01D-0741-05 TCGA-AB-2802-03A-01D-0741-05 TCGA-AB-2802-03A-01D-0741-05 Composite Element REF Beta_Value Gene_Symbol Chromosome Genomic_Coordinate cg00000292 0.751965148 ATP2A1 16 28797601 cg00002426 0.840713877 SLMAP 3 57718583 cg00003994 0.249756836 MEOX2 7 15692387 cg00005847 0.145857643 HOXD3 2 176737319 ... cg00012792 0.078874995 TXNDC5 6 8009492 cg00013618 NA IGL@ 22 20710839 ... =end # xhost09 #basedir = "/projects/tcga_analysis/external_data/dna_meth/jhu-usc.edu_LAML.HumanMethylation27.Level_3.1.0.0" basedir = "/projects/remc_bigdata/TCGA/external_data/DNA_methylation" # Mac laptop #basedir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/external_data/DNA_methylation" Dir.chdir(basedir) #Dir.chdir("/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/AML/data/genomic_wash-u/test") missing_probes = %w( cg08579995 cg09632585 cg12240237 cg19282452 cg23444894 cg24150528 cg25629118 cg25770411 cg26222045 ) ################################################################################ # TCGA-AB-2840 # xhost09 shared_lis_dir = "/projects/tcga_analysis/RNA_seq/expression/dna_meth" # local Mac #shared_lis_dir = "/Users/grobertson/GENEREG/ChIP/CHIP_SEQ/SOLEXA/TCGA/external_data/DNA_methylation" shared_tcga_ids = "tcga_patients_dnaMeth_rnaSeq_mirnaSeq.txt" shared_pathfile = File.join( shared_lis_dir, shared_tcga_ids ) begin sh_rh = File.open( shared_pathfile ) rescue raise "READ: could not open #{shared_pathfile}" end shared_ids_arr = Array.new begin while line = sh_rh.readline.chomp shared_ids_arr << line end rescue EOFError sh_rh.close puts "read #{shared_ids_arr.length} shared probe IDs" end ################################################################################ # xhost09 #outdir = "/projects/remc_bigdata/TCGA/external_data/DNA_methylation" # localhost outdir = basedir output_filename = "LAML.HumanMethylation27.Level_3.1.0.0.matrix.txt" output_pathfile = File.join( outdir, output_filename ) begin wh = File.open( output_pathfile, 'w' ) rescue raise "WRITE: could not open #{output_filename}" end ################################################################################ ################################################################################ pattern = File.join( basedir, "jhu-usc.edu_LAML.HumanMethylation27*.txt" ) samples = Dir.glob( pattern ) ################################################################################ beta = nil probe_info_hoa = Hash.new matrix_hoa = Hash.new matrix_header_arr = %w( probe_id gene_symbol chr coordinate ) nbr_of_samples_processed = 0 nbr_of_samples_retained = 0 ################################################################################ samples.each do |samplefile| nbr_of_samples_processed += 1 puts "\n#{nbr_of_samples_processed}: #{samplefile}" #sample_pathfile = File.join( basedir, samplefile ) begin rh = File.open(samplefile) rescue raise "could not open #{samplefile}" end # read just the first line and test whether we retain this sample's data line = rh.readline.chomp if line =~ /Hybridization REF\t(TCGA-AB-\d{4})-\S{3}-\S{3}-\S{4}-\S{2}\t(\S+)\t(\S+)\t(\S+)/ then tcga_patient_id = $1 id2 = $2 id3 = $3 id4 = $5 ################################################ if shared_ids_arr.member?(tcga_patient_id) then # include this sample in matrix nbr_of_samples_retained += 1 matrix_header_arr << tcga_patient_id else puts " We will NOT retain this sample's data in the matrix" rh.close next end end # If we get to this point, then we're going to read the whole file. data_lines_processed = 0 begin while line = rh.readline.chomp if line =~ /^Composite/ then # keep going elsif line =~ /^(cg\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)/ probe_id = $1 raw_beta = $2 gene_symbol = $3 chrom = $4 coordinate = $5.to_i data_lines_processed += 1 chr = "chr#{chrom}" if raw_beta =~ /N\S+/ then beta = raw_beta elsif raw_beta =~ /\d\.?\d?/ then # allow '1' beta = raw_beta.to_f else puts "unexpected beta value: #{line}: beta = #{beta}" raise end # On the first sample, create the first four columns of the table as: probeID, symbol, chr, coordinate, and add the beta value # For all subsequent samples, just add the patient_id to the header, and add the beta value to the table, for the correct probeID if probe_info_hoa.has_key?(probe_id) then # If this is sample_nbr 1, this is an error or a puzzle. if nbr_of_samples_retained == 1 then raise "duplicate probe ID for sample 1. Why? #{line}" # If this is a subsequent sample, then we should find the right number of records already in this probe's feature vector elsif nbr_of_samples_retained > 1 then curr_vector = probe_info_hoa[probe_id] updated_vector = curr_vector << beta probe_info_hoa[probe_id] = updated_vector #if ( data_lines_processed % 1000 == 0 ) then puts " #{data_lines_processed}: (#{updated_vector.join(",")})" end end else # we did not find this probe_id in the current HashOfArrays if nbr_of_samples_retained == 1 then # start a HoA record for this probe_id probe_info_hoa[probe_id] = [ gene_symbol, chr, coordinate, beta ] elsif nbr_of_samples_retained > 1 then #a puzzle. Stop and figure it out. raise "puzzle! line 123" end end end end rescue EOFError rh.close # Check that we have the expected number of probes in the HoA expected_nbr_of_probes = 27578 # 2 rows less than 27580 from 'wc -l' probe_list = probe_info_hoa.keys if (probe_list.length != expected_nbr_of_probes) then #puts samplefile puts " expected number of probes = 27578" puts " found #{probe_list.length} probe IDs" puts " unexpected number of probe IDs" #raise end end end ################################################################################ # Now write out the results wh.puts matrix_header_arr.join("\t") probe_list = probe_info_hoa.keys.sort puts "\nwriting out #{probe_list.length} rows of data" p = 0 probe_list.each do |probe| p += 1 probe_data = probe_info_hoa[probe] if (p % 1000 == 0) then puts [p, probe, probe_data].join("\t") end wh.puts [probe, probe_data].join("\t") end wh.close puts "\ndone! considered #{nbr_of_samples_processed} retained #{nbr_of_samples_retained} samples"