package ca.bcgsc.abyssexplorer.parsers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import ca.bcgsc.abyssexplorer.graph.AbyssGraph;

/**
 * Custom parser for ABySS output (compatible with
 * output files from ABySS version 1.1.0 and higher).
 *
 * <p>The parser reads one file at a time, line by line. The kind of file
 * ('adj', 'dist' or 'fasta') is chosen in {@link #open(File)} from the file
 * name suffix and stored in {@code mode}; {@link #parseNextLine()} then
 * dispatches each line to the matching {@code match*} method.
 *
 * <p>The four {@code Matcher} fields are rebuilt for every line by
 * {@link #regexSetup(CharSequence)}, so an instance keeps per-line state in
 * its fields.
 *
 * <p>NOTE(review): parts of this file appear to be damaged - wherever the
 * code contained a '&lt;' followed later by a '&gt;', the text in between
 * seems to be missing (raw {@code List}/{@code ArrayList} declarations and
 * two truncated {@code for} loops). The affected places are marked below and
 * the surviving statements are kept exactly as found.
 *
 * @author Cydney Nielsen
 *
 */
public class AbyssParser implements GraphParser {

    // Reader over the file passed to open(); null until open() has run.
    protected BufferedReader inputStream;
    protected String mode; // one of 'adj', 'dist', or 'fasta'

    // adj patterns
    // ABySS 1.1.2 versions - e.g. "0 49 101 ; 58- 71- ; 3+ 78-"
    // (three leading integers; per the regex each ';' is preceded by a tab)
    protected static Pattern adjPattern_1_1_2 = Pattern.compile("^\\s*(\\d+) (\\d+) (\\d+)\t;( \\d+[+-])*\t;( \\d+[+-])*\\s*$", Pattern.MULTILINE);
    // ABySS 1.1.0 (and 1.1.1) versions - e.g. "4 46 3926+ 47847- 569231+ 588716- ; 1077218- 1077220- 1677662+"
    // (two leading integers; a single " ;" separates the two contig lists)
    protected static Pattern adjPattern_1_1_0 = Pattern.compile("^\\s*(\\d+) (\\d+)( \\d+[+-])* ;( \\d+[+-])*\\s*$", Pattern.MULTILINE);

    // dist pattern
    // ABySS 1.1.x versions - e.g. "5 7-,-17,718,0.7 ; 11-,-46,24,3.8 61+,-46,1168,0.5"
    protected static Pattern distPattern = Pattern.compile("^\\s*(\\d+)( \\d+[+-],-?\\d+,\\d+,\\d+\\.\\d+)* ;( \\d+[+-],-?\\d+,\\d+,\\d+\\.\\d+)*\\s*$", Pattern.MULTILINE);

    // fasta header pattern - e.g. ">1770690 299 4826 1696362+,69659-" (last field optional)
    protected static Pattern fastaPattern = Pattern.compile("^>(\\d+) (\\d+) (\\d+)( \\d+[+-](,\\d+[+-])*)?\\s*$", Pattern.MULTILINE);

    // Matchers for the current line, one per pattern above; (re)created by regexSetup().
    protected Matcher ap_1_1_2;
    protected Matcher ap_1_1_0;
    protected Matcher dp;
    protected Matcher fp;

    // Placeholder returned by parseNextLine() for fasta lines that are not
    // headers, so that such lines can be told apart from end of file (null).
    protected Object dummyO = new Object();

    /**
     * Creates a fresh matcher over {@code s} for each of the four static
     * patterns and stores them in the matcher fields. Must be called for a
     * line before any {@code match*} method is used on that line.
     *
     * @param s the line about to be parsed
     */
    protected void regexSetup(CharSequence s) {
        ap_1_1_2 = adjPattern_1_1_2.matcher(s);
        ap_1_1_0 = adjPattern_1_1_0.matcher(s);
        dp = distPattern.matcher(s);
        fp = fastaPattern.matcher(s);
    }

    /**
     * Opens {@code f} for reading and selects the parse mode from the suffix
     * of its absolute path: "adj" -> 'adj', "dist" -> 'dist',
     * "contigs.fa" -> 'fasta'.
     *
     * <p>NOTE(review): the reader is created before the suffix is checked, so
     * when the suffix is unknown the exception is thrown while the reader is
     * still open and nothing here closes it. The FileNotFoundException is
     * also replaced by a new IOException without being attached as its cause.
     *
     * @param f file to read
     * @throws IOException if the file cannot be found ("No such file: ...")
     *         or its name has none of the recognised suffixes
     *         ("Unknown file type: ...")
     */
    public void open(File f) throws IOException {
        String fName = f.getAbsolutePath();
        try {
            inputStream = new BufferedReader(new FileReader(fName));
            // suffix tests use endsWith on the bare word, no leading dot
            if (fName.endsWith("adj")) {
                mode = "adj";
            } else if (fName.endsWith("dist")) {
                mode = "dist";
            } else if (fName.endsWith("contigs.fa")) {
                mode = "fasta";
            } else {
                throw (new IOException("Unknown file type: " + fName));
            }
        } catch (FileNotFoundException e) {
            throw (new IOException("No such file: " + fName));
        }
    }

    /**
     * Closes the underlying reader.
     *
     * <p>NOTE(review): {@code inputStream} has no initializer, so calling this
     * before a successful {@link #open(File)} dereferences null.
     *
     * @throws IOException if closing the reader fails
     */
    public void close() throws IOException {
        inputStream.close();
    }

    /**
     * Scans the rest of the currently open 'adj' file to size a new graph.
     *
     * <p>Every remaining line is read through {@link #parseAdjLine()}; for
     * each line that matches, the contig label is stripped of '+'/'-' and
     * parsed as an integer, and the largest value seen is kept. The graph is
     * then constructed with {@code maxLabel + 1} passed for both constructor
     * arguments. The reader is not rewound afterwards, so the lines consumed
     * here are not available to later {@link #parseNextLine()} calls on the
     * same open file.
     *
     * <p>NOTE(review): {@code mode} is null until open() succeeds, in which
     * case the {@code mode.equals} check below throws NullPointerException
     * rather than IOException.
     *
     * @return a new, sized {@code AbyssGraph}
     * @throws IOException if the current mode is not 'adj' or reading fails
     */
    public AbyssGraph initializeGraph() throws IOException {
        if (!mode.equals("adj")) {
            throw (new IOException("Must specify a .adj file"));
        }
        // count the number of vertices
        // (only referenced by the commented-out debug print below)
        int vCount = 0;
        // find the largest contig label id
        // not equivalent to single-end contig count in ABySS
        int maxE = 0;
        // parseAdjLine() returns null for non-matching lines as well as at
        // end of file, so loop termination relies on ready() instead.
        while (inputStream.ready()) {
            AdjacentContigs aContigs = parseAdjLine();
            if (aContigs == null) {continue;} // no match to known patterns
            vCount += 1;
            // one connection from single source vertex to each target
            int e = Integer.parseInt(aContigs.getLabel().replace("+", "").replace("-", ""));
            if (e > maxE) { maxE = e; }
        }
        int eCount = maxE + 1; // 0 based labels
        // System.out.println("found " + vCount + " vertices and " + eCount + " edges");
        // actual vCount typically < eCount, but > eCount/2, so using vCount=eCount reasonable estimate
        AbyssGraph g = new AbyssGraph(eCount, eCount);
        return g;
    }

    /**
     * Reads the next line and parses it according to the current mode.
     *
     * <p>Returns null at the end of the file. In 'fasta' mode a line that
     * does not match the header pattern yields the dummy Object instead of
     * null. In 'adj' and 'dist' mode the result of matchAdj / matchDist is
     * returned unchanged, and the visible code of both returns null for a
     * non-matching line - so in those modes null does not by itself mean
     * end of file.
     *
     * @return the parsed object, the dummy Object, or null (see above)
     * @throws IOException if reading fails or the mode is not one of
     *         'adj', 'dist', 'fasta' ("Unknown file format")
     */
    public Object parseNextLine() throws IOException {
        Object o = null;
        String s = inputStream.readLine();
        if (s != null) {
            regexSetup(s);
            if (mode.equals("adj")) {
                o = matchAdj(s);
            } else if (mode.equals("dist")) {
                o = matchDist(s);
            } else if (mode.equals("fasta")) {
                o = matchFasta(s);
                if (o == null) { // unimportant line in fasta file
                    o = dummyO;
                }
            } else {
                throw new IOException("Unknown file format");
            }
        }
        return o;
    }

    /**
     * Reads the next line and parses it as an adjacency line, regardless of
     * the current mode.
     *
     * @return the parsed contigs, or null when the line matches neither
     *         adjacency pattern or the end of the file has been reached
     * @throws IOException if reading fails
     */
    protected AdjacentContigs parseAdjLine() throws IOException {
        AdjacentContigs aContigs = null;
        String s = inputStream.readLine();
        if (s != null) {
            regexSetup(s);
            aContigs = matchAdj(s);
        }
        return aContigs;
    }

    /**
     * Parse adjacency information from input string
     * Compatible with output files from ABySS 1.1.0 and higher.
     * Returns null if no match.
     *
     * <p>The 1.1.2 pattern is tried first, then the 1.1.0 pattern. The
     * matchers consulted here are the ones created by
     * {@link #regexSetup(CharSequence)}, which therefore has to be called
     * with the same string beforehand.
     *
     * @param s the line to parse
     * @return the parsed contigs, or null if neither pattern is found
     */
    protected AdjacentContigs matchAdj(String s) throws IOException {
        AdjacentContigs aContigs = null;
        if (ap_1_1_2.find() == true) {
            aContigs = matchAdj_1_1_2(s);
        } else if (ap_1_1_0.find() == true) {
            aContigs = matchAdj_1_1_0(s);
        }
        return aContigs;
    }

    /**
     * Parse ABySS 1.1.0/1.1.1 adjacency information from input string
     * (e.g. "0 47 69+ 78+ ; 36+ 41-").
     *
     * <p>The visible part splits the line on single spaces, builds the label
     * from the first field plus "+", and parses the second field as the
     * length. The original comment also states "Returns null if no match";
     * that cannot be confirmed from the surviving code.
     *
     * @param s the line to parse
     * @return the parsed contigs (see the damage note inside the body)
     */
    public AdjacentContigs matchAdj_1_1_0(String s) throws IOException {
        String[] parts = s.split(" ");
        String label = parts[0] + "+";
        Integer len = Integer.parseInt(parts[1]);
        List outbound = new ArrayList();
        List inbound = new ArrayList();
        boolean isOutbound = true;
        // NOTE(review): damaged text starts on the next line. Everything from
        // the '<' of the loop condition up to a later '>' appears to be
        // missing. That presumably covered the rest of this method and the
        // declaration and first statements of matchAdj_1_1_2 (which matchAdj
        // calls but which is not declared anywhere in the surviving text).
        // The statements after the break use parts[1]/parts[2] as two
        // ';'-prefixed lists and a variable cov that is not declared in the
        // surviving text, so they presumably belong to matchAdj_1_1_2.
        // Restore the missing span from version control; kept verbatim here.
        for (int i=2; i outbound = new ArrayList();
        List inbound = new ArrayList();
        String[] oTemp = parts[1].split(" ");
        String[] iTemp = parts[2].split(" ");
        // a list holding only its leading ";" has length 1 and is skipped
        if (oTemp.length > 1) {
            for (String o: oTemp) {
                if (o.equals(";")) { continue; }
                outbound.add(o);
            }
        }
        if (iTemp.length > 1) {
            for (String i: iTemp) {
                if (i.equals(";")) { continue; }
                inbound.add(i);
            }
        }
        AdjacentContigs aContigs = null;
        if (label != null) { aContigs = new AdjacentContigs(label, len, cov); }
        // NOTE(review): the two calls below dereference aContigs without the
        // null guard used above; confirm label can never be null here.
        if (outbound.size() != 0) { aContigs.addOutbound(outbound); }
        if (inbound.size() != 0) { aContigs.addInbound(inbound); }
        return aContigs;
    }

    /**
     * Parse distance estimate information from input string
     * (e.g. "5 7-,-17,718,0.7 ; 11-,-46,24,3.8 61+,-46,1168,0.5").
     * Returns null if no match.
     *
     * <p>Relies on the {@code dp} matcher created by
     * {@link #regexSetup(CharSequence)} for the same string. The visible part
     * splits the line on single spaces and parses the first field as the
     * source id; how each target entry is turned into a list element is in
     * the damaged part of the body and cannot be described from here.
     *
     * @param s the line to parse
     * @return null if the dist pattern is not found; otherwise presumably the
     *         populated list (the return statement is not visible)
     * @throws IOException
     */
    protected List matchDist(String s) throws IOException {
        if (!dp.find()) { return null; }
        String[] parts = s.split(" ");
        // capacity parts.length-2: presumably all fields except the source id
        // and the ";" separator - confirm against the restored loop
        List dList = new ArrayList(parts.length-2);
        // extract source label
        int sId = Integer.parseInt(parts[0]);
        byte sStrand = 0; // file format specifies positive strand orientation
        // extract targets and stats
        int dId;
        byte dStrand;
        int dist;
        int nPairs;
        float error;
        boolean outbound = true;
        // NOTE(review): damaged text starts on the next line. The loop
        // condition, the loop body, the end of this method and the opening of
        // the matchFasta comment (up to the '>' of its example header) appear
        // to be missing; what follows the break is the tail of that comment.
        // Restore from version control; kept verbatim here.
        for (int i=1; i1770690 299 4826 1696362+,69659-"). * Returns null if no match. * @param s * @return * @throws IOException */
    /**
     * Parses a fasta header line such as the example given for
     * {@code fastaPattern} into a {@code PairedEndContig}.
     *
     * <p>Relies on the {@code fp} matcher created by
     * {@link #regexSetup(CharSequence)} for the same string. The contig id is
     * taken from regex group 1; the coverage is the third space-separated
     * field; members are added only when the line has exactly four
     * space-separated fields, by splitting the fourth on ','.
     *
     * <p>NOTE(review): the pattern accepts any trailing whitespace, but the
     * fields are taken from {@code split(" ")}; a trailing tab would stay
     * attached to the last field (and make the coverage unparsable on a
     * three-field line) - confirm whether such input can occur.
     *
     * @param s the line to parse
     * @return the parsed contig, or null if the line is not a fasta header
     * @throws IOException
     */
    public PairedEndContig matchFasta(String s) throws IOException {
        if (!fp.find()) { return null; }
        String[] parts = s.split(" ");
        PairedEndContig pe = null;
        pe = new PairedEndContig(Integer.parseInt(fp.group(1)));
        // parts[1] = length
        pe.setCoverage(Integer.parseInt(parts[2]));
        if (parts.length == 4) {
            for (String m: parts[3].split(",")) { pe.addMember(m); }
        }
        return pe;
    }
}