1 package baseCode.io.reader;
2
3 import java.io.BufferedReader;
4 import java.io.FileInputStream;
5 import java.io.IOException;
6 import java.io.InputStream;
7 import java.io.InputStreamReader;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.Map;
11 import java.util.Set;
12 import java.util.StringTokenizer;
13
14 import baseCode.dataStructure.matrix.NamedMatrix;
15 import baseCode.dataStructure.matrix.RCDoubleMatrix1D;
16 import baseCode.dataStructure.matrix.SparseRaggedDoubleMatrix2DNamed;
17 import baseCode.util.FileTools;
18 import cern.colt.list.DoubleArrayList;
19 import cern.colt.list.IntArrayList;
20 import cern.colt.map.OpenIntDoubleHashMap;
21 import cern.colt.map.OpenIntIntHashMap;
22 import cern.colt.map.OpenIntObjectHashMap;
23 import cern.colt.matrix.DoubleMatrix1D;
24
25 /***
 * Best data structure for reading really big, really sparse matrices when a matrix representation is needed.
27 * <p>
28 * The standard format looks like this:
29 *
30 * <pre>
31 *
32 *
33 *
34 *
35 *
36 *
37 *
38 *
39 *
40 *
41 *
42 *
43 *
44 *
45 *
46 *
47 * 2 <--- number of items - the first line of the file only. NOTE - this line is often blank or not present.
 * 1 2 <--- item 1 has 2 edges
 * 1 2 <--- edge indices are to items 1 & 2
 * 0.1 100 <--- with the following weights
 * 2 2 <--- item 2 also has 2 edges
 * 1 2 <--- edge indices are also to items 1 & 2 (fully connected)
 * 100 0.1 <--- with the following weights
54 *
55 *
56 *
57 *
58 *
59 *
60 *
61 *
62 *
63 *
64 *
65 *
66 *
67 *
68 *
69 * </pre>
70 *
71 * <hr>
72 * <p>
73 * Copyright (c) 2004 Columbia University
74 *
75 * @author pavlidis
76 * @version $Id: SparseRaggedDouble2DNamedMatrixReader.java,v 1.22 2005/01/05 02:01:02 pavlidis Exp $
77 */
78 public class SparseRaggedDouble2DNamedMatrixReader extends
79 AbstractNamedMatrixReader {
80
81 /***
82 * Read a sparse symmetric square matrix that is expressed as an adjacency list in a tab-delimited file:
83 *
84 * <pre>
85 *
86 *
87 *
88 *
89 *
90 *
91 *
92 *
93 *
94 *
95 *
96 *
97 * item1 item2 weight
98 * item1 item5 weight
99 *
100 *
101 *
102 *
103 *
104 *
105 *
106 *
107 *
108 *
109 *
110 *
111 * </pre>
112 *
113 * <p>
114 * IMPORTANT: By definition the resulting matrix is square and symmetric, even if the symmetric edges are not
115 * explicitly listed.
116 *
117 * @param name of file
118 * @return
119 */
120 public NamedMatrix readFromAdjList( String fileName ) throws IOException {
121 if ( !FileTools.testFile( fileName ) ) {
122 throw new IOException( "Could not read from file " + fileName );
123 }
124 FileInputStream stream = new FileInputStream( fileName );
125 return readFromAdjList( stream );
126 }
127
128 /***
129 * @throws IOException
130 * @throws NumberFormatException Read a sparse symmetric square matrix that is expressed as an adjacency list in a
131 * tab-delimited file:
132 *
133 * <pre>
134 *
135 *
136 *
137 *
138 *
139 *
140 *
141 *
142 *
143 *
144 *
145 *
146 * item1 item2 weight
147 * item1 item5 weight
148 *
149 *
150 *
151 *
152 *
153 *
154 *
155 *
156 *
157 *
158 *
159 *
160 * </pre>
161 *
162 * <p>
163 * IMPORTANT: By definition the resulting matrix is square and symmetric, even if the symmetric edges are not
164 * explicitly listed.
165 * @param stream
166 * @return
167 */
168 public NamedMatrix readFromAdjList( InputStream stream )
169 throws NumberFormatException, IOException {
170 Set itemNames = new HashSet();
171 Map rows = new HashMap();
172
173 BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
174
175 OpenIntObjectHashMap indexNameMap = new OpenIntObjectHashMap();
176 Map nameIndexMap = new HashMap();
177
178
179
180
181
182 String row;
183 int index = 0;
184 while ( ( row = dis.readLine() ) != null ) {
185 StringTokenizer st = new StringTokenizer( row, " \t", false );
186
187 String itemA = "";
188 if ( st.hasMoreTokens() ) {
189 itemA = st.nextToken();
190 if ( !itemNames.contains( itemA ) ) {
191 rows.put( itemA, new OpenIntDoubleHashMap() );
192 itemNames.add( itemA );
193 indexNameMap.put( index, itemA );
194 nameIndexMap.put( itemA, new Integer( index ) );
195 ( ( OpenIntDoubleHashMap ) rows.get( itemA ) ).put( index, 0 );
196 index++;
197 }
198 } else
199 continue;
200
201 String itemB = "";
202 if ( st.hasMoreTokens() ) {
203 itemB = st.nextToken();
204 if ( !itemNames.contains( itemB ) ) {
205 rows.put( itemB, new OpenIntDoubleHashMap() );
206 itemNames.add( itemB );
207 indexNameMap.put( index, itemB );
208 nameIndexMap.put( itemB, new Integer( index ) );
209 ( ( OpenIntDoubleHashMap ) rows.get( itemB ) ).put( index, 0 );
210 index++;
211 }
212 } else
213 continue;
214
215 double weight;
216 if ( st.hasMoreTokens() ) {
217 weight = Double.parseDouble( st.nextToken() );
218 } else {
219 weight = 1.0;
220 }
221
222 int aind = ( ( Integer ) nameIndexMap.get( itemA ) ).intValue();
223 int bind = ( ( Integer ) nameIndexMap.get( itemB ) ).intValue();
224
225
226
227
228 ( ( OpenIntDoubleHashMap ) rows.get( itemA ) ).put( bind, weight );
229 ( ( OpenIntDoubleHashMap ) rows.get( itemB ) ).put( aind, weight );
230
231 if ( ( rows.size() % 500 ) == 0 ) {
232 log.info( new String( "loading " + index + "th pair" ) );
233 }
234 }
235 dis.close();
236
237 SparseRaggedDoubleMatrix2DNamed matrix = new SparseRaggedDoubleMatrix2DNamed();
238
239 for ( int i = 0; i < indexNameMap.size(); i++ ) {
240 String itemName = ( String ) indexNameMap.get( i );
241
242 OpenIntDoubleHashMap arow = ( OpenIntDoubleHashMap ) rows
243 .get( itemName );
244
245 DoubleArrayList finalValues = new DoubleArrayList( arow.size() );
246
247
248 IntArrayList inB = arow.keys();
249 inB.sort();
250 int[] rowMemberIndexes = inB.elements();
251
252
253 for ( int j = 0; j < rowMemberIndexes.length; j++ ) {
254 int itemNumber = rowMemberIndexes[j];
255 double weight = arow.get( itemNumber );
256 finalValues.add( weight );
257 }
258
259 DoubleMatrix1D rowMatrix = new RCDoubleMatrix1D( inB, finalValues );
260 matrix.addRow( itemName, rowMatrix );
261
262 if ( i > 0 && ( i % 500 ) == 0 ) {
263 log.info( new String( "Adding " + i + "th row" ) );
264 }
265 }
266 return matrix;
267 }
268
269
270
271
272
273
274 public NamedMatrix read( String fileName ) throws IOException {
275 if ( !FileTools.testFile( fileName ) ) {
276 throw new IOException( "Could not read from file " + fileName );
277 }
278 FileInputStream stream = new FileInputStream( fileName );
279 return read( stream );
280 }
281
282 public NamedMatrix readOneRow( BufferedReader dis ) throws IOException {
283 return this.readOneRow( dis, 0 );
284 }
285
286 /***
287 * Use this to read one row from a matrix (JW format). It does not close the reader. (this actually has to read
288 * several lines to get the data for one matrix row)
289 *
290 * @param stream
291 * @param offset A value indicating the lowest value for the indexes listed. This is here in case the indexes in the
292 * stream are numbered starting from 1 instead of zero.
293 * @return @throws IOException
294 */
295 public NamedMatrix readOneRow( BufferedReader dis, int offset )
296 throws IOException {
297 SparseRaggedDoubleMatrix2DNamed returnVal = new SparseRaggedDoubleMatrix2DNamed();
298
299 String row = dis.readLine();
300 StringTokenizer tok = new StringTokenizer( row, " \t" );
301
302 int index = Integer.parseInt( tok.nextToken() );
303 int amount = Integer.parseInt( tok.nextToken() );
304 String rowName = new Integer( index ).toString();
305 returnVal.addRow( rowName, readOneRow( dis, amount, offset ) );
306 return returnVal;
307 }
308
309 /***
310 * Read an entire sparse matrix from a stream (JW format).
311 *
312 * @param stream
313 * @return @throws IOException
314 */
315 public NamedMatrix read( InputStream stream ) throws IOException {
316 return this.read( stream, 0 );
317 }
318
319 /***
320 * Read an entire sparse matrix from a stream (JW format).
321 *
322 * @param stream
323 * @param offset A value indicating the lowest value for the indexes listed. This is here in case the indexes in the
324 * stream are numbered starting from 1 instead of zero.
325 * @return @throws IOException
326 */
327 public NamedMatrix read( InputStream stream, int offset ) throws IOException {
328 BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
329 SparseRaggedDoubleMatrix2DNamed returnVal = new SparseRaggedDoubleMatrix2DNamed();
330
331 String row;
332 int k = 1;
333
334 while ( ( row = dis.readLine() ) != null ) {
335
336 if ( row.equals( "" ) ) {
337 continue;
338 }
339
340 StringTokenizer tok = new StringTokenizer( row, " \t" );
341 if ( tok.countTokens() != 2 ) {
342 continue;
343 }
344
345 int index = Integer.parseInt( tok.nextToken() ) - offset;
346 int amount = Integer.parseInt( tok.nextToken() );
347
348 if ( ( index % 500 ) == 0 ) {
349 log.info( new String( "loading " + index + "th entry" ) );
350 }
351
352 returnVal.addRow( new Integer( k ).toString(), readOneRow( dis,
353 amount, offset ) );
354
355 k++;
356 }
357
358 dis.close();
359 return returnVal;
360 }
361
362 private DoubleMatrix1D readOneRow( BufferedReader dis, int amount, int offset )
363 throws IOException {
364
365
366
367
368
369 String rowInd = "";
370 String rowWei = "";
371
372
373 rowInd = dis.readLine();
374
375
376
377 rowWei = dis.readLine();
378
379
380 StringTokenizer tokw = new StringTokenizer( rowWei, " \t" );
381 StringTokenizer toki = new StringTokenizer( rowInd, " \t" );
382
383 OpenIntIntHashMap map = new OpenIntIntHashMap( amount, 0.4, 0.8 );
384 DoubleArrayList values = new DoubleArrayList( amount );
385 DoubleArrayList finalValues = new DoubleArrayList( amount );
386
387 int i = 0;
388 while ( toki.hasMoreTokens() ) {
389
390 double weight = Double.parseDouble( tokw.nextToken() );
391 int ind = Integer.parseInt( toki.nextToken() ) - offset;
392
393 if ( ind < 0 ) {
394 throw new IllegalStateException(
395 "Can't have negative index - check offset." );
396 }
397
398 map.put( ind, i );
399 values.add( weight );
400 i++;
401 }
402
403 IntArrayList indexes = map.keys();
404 indexes.sort();
405 int[] ix = indexes.elements();
406 int size = ix.length;
407 for ( int j = 0; j < size; j++ ) {
408 finalValues.add( values.get( map.get( ix[j] ) ) );
409 }
410
411 return new RCDoubleMatrix1D( indexes, finalValues );
412 }
413
414 }