View Javadoc

1   package baseCode.io.reader;
2   
3   import java.io.BufferedReader;
4   import java.io.File;
5   import java.io.FileInputStream;
6   import java.io.IOException;
7   import java.io.InputStream;
8   import java.io.InputStreamReader;
9   import java.util.Collections;
10  import java.util.HashMap;
11  import java.util.HashSet;
12  import java.util.Iterator;
13  import java.util.Map;
14  import java.util.Set;
15  import java.util.StringTokenizer;
16  import java.util.Vector;
17  
18  import baseCode.dataStructure.matrix.NamedMatrix;
19  import baseCode.dataStructure.matrix.SparseDoubleMatrix2DNamed;
20  
21  import com.braju.beta.format.Format;
22  import com.braju.beta.format.FormatReader;
23  import com.braju.beta.format.FormatString;
24  import com.braju.beta.format.Parameters;
25  import com.braju.beta.lang.DoubleVariable;
26  import com.braju.beta.lang.IntegerVariable;
27  import com.braju.beta.lang.NumberVariable;
28  
29  /***
30   * <hr>
31   * <p>
32   * Copyright (c) 2004 Columbia University
33   * 
34   * @author pavlidis
35   * @version $Id: SparseDoubleMatrixReader.java,v 1.4 2004/08/16 02:06:01 pavlidis Exp $
36   */
37  public class SparseDoubleMatrixReader extends AbstractNamedMatrixReader {
38  
39     /***
40      * @param filename data file to read from
41      * @return NamedMatrix object constructed from the data file
42      * @throws IOException
43      */
44     public NamedMatrix read( String filename ) throws IOException {
45        return read( filename, null );
46     }
47  
48     /***
49      * @param stream InputStream stream to read from
50      * @return NamedMatrix object constructed from the data file
51      * @throws IOException
52      */
53     public NamedMatrix read( InputStream stream ) throws IOException {
54        return read( stream, null );
55     }
56  
57     /***
58      * Read a matrix from a file, subject to filtering criteria.
59      * 
60      * @param filename data file to read from
61      * @param wantedRowNames contains names of rows we want to get
62      * @return NamedMatrix object constructed from the data file
63      * @throws IOException
64      */
65     public NamedMatrix read( String filename, Set wantedRowNames )
66           throws IOException {
67        File infile = new File( filename );
68        if ( !infile.exists() || !infile.canRead() ) {
69           throw new IOException( "Could not read from file " + filename );
70        }
71        FileInputStream stream = new FileInputStream( infile );
72        return read( stream, wantedRowNames );
73     } // end read
74  
75     /***
76      * Read a sparse matrix in "JW" (Jason Weston) format. The format is like this:
77      * 
78      * <pre>          
79      *               2          &lt;--- number of items - the first line of the file only. NOTE - this line is often blank or not present.
80      *               1 2        &lt;--- items 1 has 2 edges
81      *               1 2        &lt;--- edge indices are to items 1 &amp; 2
82      *               0.1 100    &lt;--- with the following weights
83      *               2 2        &lt;--- items 2 also has 2 edges
84      *               1 2        &lt;--- edge indices are also to items 1 &amp; 2 (fully connected)
85      *               100 0.1    &lt;--- with the following weights
86      * </pre>
87      * 
88      * <p>
89      * Note that the item numbering starts at 1. This is a requirement.
90      * <p>
91      * Note that this cannot handle very large matrices - the limit to rows x columns is the number Integer.MAX_VALUE.
92      * This is an implementation problem for colt's sparse matrix.
93      * 
94      * @param stream
95      * @param wantedRowNames
96      * @return @throws IOException
97      */
98     public NamedMatrix readJW( InputStream stream ) throws IOException,
99           IllegalAccessException, NoSuchFieldException {
100 
101       BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
102       FormatReader ff = new FormatReader( dis );
103       Parameters p = new Parameters();
104       FormatString fmtdd = ff.compileFormatString( "%d %d" );
105       FormatString fmtd = ff.compileFormatString( "%d" );
106       FormatString fmtg = ff.compileFormatString( "%g" );
107 
108       NumberVariable index = new IntegerVariable();
109       NumberVariable amount = new IntegerVariable();
110       DoubleVariable eval = new DoubleVariable();
111 
112       int dim = 10;
113 
114       dim = new Integer( Integer.parseInt( dis.readLine() ) ).intValue();
115       SparseDoubleMatrix2DNamed returnVal = new SparseDoubleMatrix2DNamed( dim,
116           dim );
117 
118       for ( int k = 1; k <= dim; k++ ) {
119 
120          returnVal.addColumnName( new Integer( k ).toString(), k - 1 );
121          returnVal.addRowName( new Integer( k ).toString(), k - 1 );
122 
123          ff.read( fmtdd, p.add( index ).add( amount ) ); // "item 1 has 2 edges"
124 
125          if ( ( index.intValue() % 500 ) == 0 ) {
126             log.debug( Format.sprintf(
127                   "loading %2.1f%% complete (%dth entry)... \n", p.add(
128                         100.0 * ( ( index.intValue() ) / ( ( float ) dim ) ) )
129                         .add( index ) ) );
130          }
131 
132          int[] rowind = new int[amount.intValue()];
133          for ( int i = 0; i < amount.intValue(); i++ ) { // "edge indices are to 1 and 2"
134 
135             Format.fscanf( ff, fmtd, p.add( index ) );
136             int ind = index.intValue();
137 
138             if ( ind > dim || ind < 1 ) {
139                throw new IllegalStateException( "Illegal value " + ind
140                      + " found in index list for item " + k );
141             }
142             rowind[i] = ind;
143          }
144 
145          for ( int i = 0; i < amount.intValue(); i++ ) { // "with the following weights"
146             Format.fscanf( ff, fmtg, p.add( eval ) );
147             returnVal.setQuick( k - 1, rowind[i] - 1, eval.doubleValue() );
148         //    returnVal.setQuick( rowind[i] - 1, k - 1, eval.doubleValue() );
149          }
150 
151       }
152       ff.close();
153       return returnVal;
154    }
155 
156    /***
157     * Read a sparse matrix that is expressed as an adjacency list in a tab-delimited file:
158     * 
159     * <pre>
160     *               item1 item2 weight
161     *               item1 item5 weight
162     * </pre>
163     * 
164     * <p>
165     * By definition the resulting matrix is square and symmetric.
166     * </p>
167     * <p>
168     * Note that the ordering of the items will be as they are encountered in the file.
169     * 
170     * @param stream InputStream
171     * @param wantedRowNames Set
172     * @return NamedMatrix
173     * @throws IOException
174     */
175    public NamedMatrix read( InputStream stream, Set wantedRowNames )
176          throws IOException {
177 
178       Set itemNames = new HashSet();
179       Map rows = new HashMap();
180 
181       BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
182 
183       String row;
184       int index = 0;
185       Map nameIndexMap = new HashMap(); // name --> eventual row index
186       while ( ( row = dis.readLine() ) != null ) {
187          StringTokenizer st = new StringTokenizer( row, " \t", false );
188 
189          String itemA = "";
190           
191          if ( st.hasMoreTokens() ) {
192             itemA = st.nextToken();
193 
194             if ( !itemNames.contains( itemA ) ) {
195                rows.put( itemA, new HashSet() );
196                itemNames.add( itemA );
197                nameIndexMap.put( itemA, new Integer( index ) );
198                index++;
199             }
200          } else {
201           //  continue;
202          }
203 
204          String itemB = "";
205          if ( st.hasMoreTokens() ) {
206             itemB = st.nextToken();
207             if ( !itemNames.contains( itemB ) ) {
208                rows.put( itemB, new HashSet() );
209                itemNames.add( itemB );
210                nameIndexMap.put( itemB, new Integer( index ) );
211                index++;
212             }
213          } else {
214           //  continue;
215          }
216 
217          double weight;
218          if ( st.hasMoreTokens() ) {
219             weight = Double.parseDouble( st.nextToken() );
220          } else {
221             weight = 1.0; // just make it a binary matrix.
222          }
223 
224          ( ( Set ) rows.get( itemA ) ).add( new IndexScoreDyad(
225                ( ( Integer ) nameIndexMap.get( itemB ) ).intValue(), weight ) );
226          ( ( Set ) rows.get( itemB ) ).add( new IndexScoreDyad(
227                ( ( Integer ) nameIndexMap.get( itemA ) ).intValue(), weight ) );
228       }
229 
230       SparseDoubleMatrix2DNamed matrix = new SparseDoubleMatrix2DNamed(
231             itemNames.size(), itemNames.size() );
232       
233       Vector itemVec = new Vector( itemNames );
234       Collections.sort(itemVec);
235       
236       matrix.setColumnNames( itemVec  );
237       matrix.setRowNames( itemVec );
238       for ( Iterator iter = itemNames.iterator(); iter.hasNext(); ) {
239          String itemA = ( String ) iter.next();
240          int rowIndex = matrix.getRowIndexByName( itemA );
241          Set arow = ( Set ) rows.get( itemA );
242          for ( Iterator iterator = arow.iterator(); iterator.hasNext(); ) {
243             IndexScoreDyad element = ( IndexScoreDyad ) iterator.next();
244             int ind = element.getKey();
245             double weight = element.getValue();
246           
247             matrix.setQuick( rowIndex, ind, weight );
248             matrix.setQuick( ind, rowIndex, weight );
249          }
250 
251       }
252 
253       dis.close();
254       return matrix;
255    }
256 
257    /* (non-Javadoc)
258     * @see baseCode.io.reader.AbstractNamedMatrixReader#readOneRow(java.io.BufferedReader)
259     */
260    public NamedMatrix readOneRow( BufferedReader dis ) throws IOException {
261       // TODO Auto-generated method stub
262       // this is impossible for the pair method.
263       throw new UnsupportedOperationException();
264    }
265 
266 }
267 
268