View Javadoc

1   package baseCode.io.reader;
2   
3   import java.io.BufferedReader;
4   import java.io.FileInputStream;
5   import java.io.IOException;
6   import java.io.InputStream;
7   import java.io.InputStreamReader;
8   import java.util.HashMap;
9   import java.util.HashSet;
10  import java.util.Map;
11  import java.util.Set;
12  import java.util.StringTokenizer;
13  
14  import baseCode.dataStructure.matrix.NamedMatrix;
15  import baseCode.dataStructure.matrix.RCDoubleMatrix1D;
16  import baseCode.dataStructure.matrix.SparseRaggedDoubleMatrix2DNamed;
17  import baseCode.util.FileTools;
18  import cern.colt.list.DoubleArrayList;
19  import cern.colt.list.IntArrayList;
20  import cern.colt.map.OpenIntDoubleHashMap;
21  import cern.colt.map.OpenIntIntHashMap;
22  import cern.colt.map.OpenIntObjectHashMap;
23  import cern.colt.matrix.DoubleMatrix1D;
24  
25  /***
26   * Best data structure for reading really big, really sparse matrices when a matrix representation is needed.
27   * <p>
28   * The standard format looks like this:
29   * 
30   * <pre>
31   * 
32   *  
33   *   
34   *    
35   *     
36   *      
37   *       
38   *        
39   *         
40   *          
41   *           
42   *            
43   *             
44   *              
45   *               
46   *                
47   *                                         2          &lt;--- number of items - the first line of the file only. NOTE - this line is often blank or not present.
48   *                                         1 2        &lt;--- items 1 has 2 edges
49   *                                         1 2        &lt;--- edge indices are to items 1 &amp; 2
50   *                                         0.1 100    &lt;--- with the following weights
51   *                                         2 2        &lt;--- items 2 also has 2 edges
52   *                                         1 2        &lt;--- edge indices are also to items 1 &amp; 2 (fully connected)
53   *                                         100 0.1    &lt;--- with the following weights
54   *                
55   *               
56   *              
57   *             
58   *            
59   *           
60   *          
61   *         
62   *        
63   *       
64   *      
65   *     
66   *    
67   *   
68   *  
69   * </pre>
70   * 
71   * <hr>
72   * <p>
73   * Copyright (c) 2004 Columbia University
74   * 
75   * @author pavlidis
76   * @version $Id: SparseRaggedDouble2DNamedMatrixReader.java,v 1.22 2005/01/05 02:01:02 pavlidis Exp $
77   */
78  public class SparseRaggedDouble2DNamedMatrixReader extends
79        AbstractNamedMatrixReader {
80  
81     /***
82      * Read a sparse symmetric square matrix that is expressed as an adjacency list in a tab-delimited file:
83      * 
84      * <pre>
85      * 
86      *  
87      *   
88      *    
89      *     
90      *      
91      *       
92      *        
93      *         
94      *          
95      *           
96      *            
97      *                           item1 item2 weight
98      *                           item1 item5 weight
99      *             
100     *            
101     *           
102     *          
103     *         
104     *        
105     *       
106     *      
107     *     
108     *    
109     *   
110     *  
111     * </pre>
112     * 
113     * <p>
114     * IMPORTANT: By definition the resulting matrix is square and symmetric, even if the symmetric edges are not
115     * explicitly listed.
116     * 
117     * @param name of file
118     * @return
119     */
120    public NamedMatrix readFromAdjList( String fileName ) throws IOException {
121       if ( !FileTools.testFile( fileName ) ) {
122          throw new IOException( "Could not read from file " + fileName );
123       }
124       FileInputStream stream = new FileInputStream( fileName );
125       return readFromAdjList( stream );
126    }
127 
128    /***
129     * @throws IOException
130     * @throws NumberFormatException Read a sparse symmetric square matrix that is expressed as an adjacency list in a
131     *         tab-delimited file:
132     * 
133     * <pre>
134     * 
135     *  
136     *   
137     *    
138     *     
139     *      
140     *       
141     *        
142     *         
143     *          
144     *           
145     *            
146     *              item1 item2 weight
147     *              item1 item5 weight
148     *             
149     *            
150     *           
151     *          
152     *         
153     *        
154     *       
155     *      
156     *     
157     *    
158     *   
159     *  
160     * </pre>
161     * 
162     * <p>
163     *         IMPORTANT: By definition the resulting matrix is square and symmetric, even if the symmetric edges are not
164     *         explicitly listed.
165     * @param stream
166     * @return
167     */
168    public NamedMatrix readFromAdjList( InputStream stream )
169          throws NumberFormatException, IOException {
170       Set itemNames = new HashSet();
171       Map rows = new HashMap();
172 
173       BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
174 
175       OpenIntObjectHashMap indexNameMap = new OpenIntObjectHashMap(); // eventual row index --> name
176       Map nameIndexMap = new HashMap(); // name --> eventual row index
177 
178       /*
179        * Store the information about the matrix in a temporary set of data structures, the most important of which is a
180        * map of nodes to edge information. Each edge information object contains the index and the weight of the edge.
181        */
182       String row;
183       int index = 0;
184       while ( ( row = dis.readLine() ) != null ) {
185          StringTokenizer st = new StringTokenizer( row, " \t", false );
186 
187          String itemA = "";
188          if ( st.hasMoreTokens() ) {
189             itemA = st.nextToken();
190             if ( !itemNames.contains( itemA ) ) {
191                rows.put( itemA, new OpenIntDoubleHashMap() );
192                itemNames.add( itemA );
193                indexNameMap.put( index, itemA );
194                nameIndexMap.put( itemA, new Integer( index ) );
195                ( ( OpenIntDoubleHashMap ) rows.get( itemA ) ).put( index, 0 ); // to itself. - in case it isn't there.
196                index++;
197             }
198          } else
199             continue;
200 
201          String itemB = "";
202          if ( st.hasMoreTokens() ) {
203             itemB = st.nextToken();
204             if ( !itemNames.contains( itemB ) ) {
205                rows.put( itemB, new OpenIntDoubleHashMap() );
206                itemNames.add( itemB );
207                indexNameMap.put( index, itemB );
208                nameIndexMap.put( itemB, new Integer( index ) );
209                ( ( OpenIntDoubleHashMap ) rows.get( itemB ) ).put( index, 0 ); // to itself. - in case it isn't there.
210                index++;
211             }
212          } else
213             continue;
214 
215          double weight;
216          if ( st.hasMoreTokens() ) {
217             weight = Double.parseDouble( st.nextToken() );
218          } else {
219             weight = 1.0; // just make it a binary matrix.
220          }
221 
222          int aind = ( ( Integer ) nameIndexMap.get( itemA ) ).intValue();
223          int bind = ( ( Integer ) nameIndexMap.get( itemB ) ).intValue();
224 
225      //    if (itemA.equals("CYP4A11") || itemB.equals("CYP4A11")) 
226    //      System.err.println( itemA + " " + itemB + " " + aind + " " + bind );
227 
228          ( ( OpenIntDoubleHashMap ) rows.get( itemA ) ).put( bind, weight ); // link a to b.
229          ( ( OpenIntDoubleHashMap ) rows.get( itemB ) ).put( aind, weight ); // link b to a.
230          
231          if ( ( rows.size() % 500 ) == 0 ) {
232             log.info( new String( "loading  " + index + "th pair" ) );
233          }
234       }
235       dis.close();
236 
237       SparseRaggedDoubleMatrix2DNamed matrix = new SparseRaggedDoubleMatrix2DNamed();
238 
239       for ( int i = 0; i < indexNameMap.size(); i++ ) {
240          String itemName = ( String ) indexNameMap.get( i );
241 
242          OpenIntDoubleHashMap arow = ( OpenIntDoubleHashMap ) rows
243                .get( itemName );
244 
245          DoubleArrayList finalValues = new DoubleArrayList( arow.size() );
246 
247     //     System.err.println( itemName + " has " + arow.size() + " links" );
248          IntArrayList inB = arow.keys();
249          inB.sort();
250          int[] rowMemberIndexes = inB.elements();
251        //  System.err.println( itemName + " " + i + " " + inB );
252          
253          for ( int j = 0; j < rowMemberIndexes.length; j++ ) {
254             int itemNumber = rowMemberIndexes[j]; // keys
255             double weight = arow.get( itemNumber );
256             finalValues.add( weight );
257          }
258 
259          DoubleMatrix1D rowMatrix = new RCDoubleMatrix1D( inB, finalValues );
260          matrix.addRow( itemName, rowMatrix );
261 
262          if ( i > 0 && ( i % 500 ) == 0 ) {
263             log.info( new String( "Adding  " + i + "th row" ) );
264          }
265       }
266       return matrix;
267    }
268 
269    /*
270     * (non-Javadoc)
271     * 
272     * @see baseCode.io.reader.AbstractNamedMatrixReader#read(java.lang.String)
273     */
274    public NamedMatrix read( String fileName ) throws IOException {
275       if ( !FileTools.testFile( fileName ) ) {
276          throw new IOException( "Could not read from file " + fileName );
277       }
278       FileInputStream stream = new FileInputStream( fileName );
279       return read( stream );
280    }
281 
282    public NamedMatrix readOneRow( BufferedReader dis ) throws IOException {
283       return this.readOneRow( dis, 0 );
284    }
285 
286    /***
287     * Use this to read one row from a matrix (JW format). It does not close the reader. (this actually has to read
288     * several lines to get the data for one matrix row)
289     * 
290     * @param stream
291     * @param offset A value indicating the lowest value for the indexes listed. This is here in case the indexes in the
292     *        stream are numbered starting from 1 instead of zero.
293     * @return @throws IOException
294     */
295    public NamedMatrix readOneRow( BufferedReader dis, int offset )
296          throws IOException {
297       SparseRaggedDoubleMatrix2DNamed returnVal = new SparseRaggedDoubleMatrix2DNamed();
298 
299       String row = dis.readLine(); // line containing the id and the number of edges.
300       StringTokenizer tok = new StringTokenizer( row, " \t" );
301 
302       int index = Integer.parseInt( tok.nextToken() );
303       int amount = Integer.parseInt( tok.nextToken() );
304       String rowName = new Integer( index ).toString();
305       returnVal.addRow( rowName, readOneRow( dis, amount, offset ) );
306       return returnVal;
307    }
308 
309    /***
310     * Read an entire sparse matrix from a stream (JW format).
311     * 
312     * @param stream
313     * @return @throws IOException
314     */
315    public NamedMatrix read( InputStream stream ) throws IOException {
316       return this.read( stream, 0 );
317    }
318 
319    /***
320     * Read an entire sparse matrix from a stream (JW format).
321     * 
322     * @param stream
323     * @param offset A value indicating the lowest value for the indexes listed. This is here in case the indexes in the
324     *        stream are numbered starting from 1 instead of zero.
325     * @return @throws IOException
326     */
327    public NamedMatrix read( InputStream stream, int offset ) throws IOException {
328       BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
329       SparseRaggedDoubleMatrix2DNamed returnVal = new SparseRaggedDoubleMatrix2DNamed();
330 
331       String row;
332       int k = 1;
333 
334       while ( ( row = dis.readLine() ) != null ) {
335 
336          if ( row.equals( "" ) ) { // in case there is a blank line at the top.
337             continue;
338          }
339 
340          StringTokenizer tok = new StringTokenizer( row, " \t" );
341          if ( tok.countTokens() != 2 ) { // in case the row count is there.
342             continue;
343          }
344 
345          int index = Integer.parseInt( tok.nextToken() ) - offset;
346          int amount = Integer.parseInt( tok.nextToken() );
347 
348          if ( ( index % 500 ) == 0 ) {
349             log.info( new String( "loading  " + index + "th entry" ) );
350          }
351 
352          returnVal.addRow( new Integer( k ).toString(), readOneRow( dis,
353                amount, offset ) );
354 
355          k++;
356       }
357 
358       dis.close();
359       return returnVal;
360    }
361 
362    private DoubleMatrix1D readOneRow( BufferedReader dis, int amount, int offset )
363          throws IOException {
364 
365       /*
366        * we have to be careful to skip any lines that invalid. Each line should have at least two characters. In the
367        * files JW provided there are some lines that are just " ".
368        */
369       String rowInd = "";
370       String rowWei = "";
371 
372       //     while ( rowInd.length() < 2 ) {
373       rowInd = dis.readLine(); // row with indices.
374       //    }
375 
376       //    while ( rowWei.length() < 2 ) {
377       rowWei = dis.readLine(); // row with weights.
378       //    }
379 
380       StringTokenizer tokw = new StringTokenizer( rowWei, " \t" );
381       StringTokenizer toki = new StringTokenizer( rowInd, " \t" );
382 
383       OpenIntIntHashMap map = new OpenIntIntHashMap( amount, 0.4, 0.8 );
384       DoubleArrayList values = new DoubleArrayList( amount );
385       DoubleArrayList finalValues = new DoubleArrayList( amount );
386 
387       int i = 0;
388       while ( toki.hasMoreTokens() ) {
389 
390          double weight = Double.parseDouble( tokw.nextToken() );
391          int ind = Integer.parseInt( toki.nextToken() ) - offset;
392 
393          if ( ind < 0 ) {
394             throw new IllegalStateException(
395                   "Can't have negative index - check offset." );
396          }
397 
398          map.put( ind, i );
399          values.add( weight );
400          i++;
401       }
402 
403       IntArrayList indexes = map.keys();
404       indexes.sort();
405       int[] ix = indexes.elements();
406       int size = ix.length;
407       for ( int j = 0; j < size; j++ ) {
408          finalValues.add( values.get( map.get( ix[j] ) ) );
409       }
410 
411       return new RCDoubleMatrix1D( indexes, finalValues );
412    }
413 
414 }