1 package baseCode.io.reader;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileInputStream;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.util.Collections;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.Iterator;
13 import java.util.Map;
14 import java.util.Set;
15 import java.util.StringTokenizer;
16 import java.util.Vector;
17
18 import baseCode.dataStructure.matrix.NamedMatrix;
19 import baseCode.dataStructure.matrix.SparseDoubleMatrix2DNamed;
20
21 import com.braju.beta.format.Format;
22 import com.braju.beta.format.FormatReader;
23 import com.braju.beta.format.FormatString;
24 import com.braju.beta.format.Parameters;
25 import com.braju.beta.lang.DoubleVariable;
26 import com.braju.beta.lang.IntegerVariable;
27 import com.braju.beta.lang.NumberVariable;
28
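/**
 * Reads sparse, square matrices of doubles from text, either as a whitespace-delimited adjacency
 * list (the <code>read</code> methods) or in "JW" format (<code>readJW</code>).
 * <hr>
 * <p>
 * Copyright (c) 2004 Columbia University
 *
 * @author pavlidis
 * @version $Id: SparseDoubleMatrixReader.java,v 1.4 2004/08/16 02:06:01 pavlidis Exp $
 */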
37 public class SparseDoubleMatrixReader extends AbstractNamedMatrixReader {
38
39 /***
40 * @param filename data file to read from
41 * @return NamedMatrix object constructed from the data file
42 * @throws IOException
43 */
44 public NamedMatrix read( String filename ) throws IOException {
45 return read( filename, null );
46 }
47
48 /***
49 * @param stream InputStream stream to read from
50 * @return NamedMatrix object constructed from the data file
51 * @throws IOException
52 */
53 public NamedMatrix read( InputStream stream ) throws IOException {
54 return read( stream, null );
55 }
56
57 /***
58 * Read a matrix from a file, subject to filtering criteria.
59 *
60 * @param filename data file to read from
61 * @param wantedRowNames contains names of rows we want to get
62 * @return NamedMatrix object constructed from the data file
63 * @throws IOException
64 */
65 public NamedMatrix read( String filename, Set wantedRowNames )
66 throws IOException {
67 File infile = new File( filename );
68 if ( !infile.exists() || !infile.canRead() ) {
69 throw new IOException( "Could not read from file " + filename );
70 }
71 FileInputStream stream = new FileInputStream( infile );
72 return read( stream, wantedRowNames );
73 }
74
75 /***
76 * Read a sparse matrix in "JW" (Jason Weston) format. The format is like this:
77 *
78 * <pre>
79 * 2 <--- number of items - the first line of the file only. NOTE - this line is often blank or not present.
80 * 1 2 <--- items 1 has 2 edges
81 * 1 2 <--- edge indices are to items 1 & 2
82 * 0.1 100 <--- with the following weights
83 * 2 2 <--- items 2 also has 2 edges
84 * 1 2 <--- edge indices are also to items 1 & 2 (fully connected)
85 * 100 0.1 <--- with the following weights
86 * </pre>
87 *
88 * <p>
89 * Note that the item numbering starts at 1. This is a requirement.
90 * <p>
91 * Note that this cannot handle very large matrices - the limit to rows x columns is the number Integer.MAX_VALUE.
92 * This is an implementation problem for colt's sparse matrix.
93 *
94 * @param stream
95 * @param wantedRowNames
96 * @return @throws IOException
97 */
98 public NamedMatrix readJW( InputStream stream ) throws IOException,
99 IllegalAccessException, NoSuchFieldException {
100
101 BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
102 FormatReader ff = new FormatReader( dis );
103 Parameters p = new Parameters();
104 FormatString fmtdd = ff.compileFormatString( "%d %d" );
105 FormatString fmtd = ff.compileFormatString( "%d" );
106 FormatString fmtg = ff.compileFormatString( "%g" );
107
108 NumberVariable index = new IntegerVariable();
109 NumberVariable amount = new IntegerVariable();
110 DoubleVariable eval = new DoubleVariable();
111
112 int dim = 10;
113
114 dim = new Integer( Integer.parseInt( dis.readLine() ) ).intValue();
115 SparseDoubleMatrix2DNamed returnVal = new SparseDoubleMatrix2DNamed( dim,
116 dim );
117
118 for ( int k = 1; k <= dim; k++ ) {
119
120 returnVal.addColumnName( new Integer( k ).toString(), k - 1 );
121 returnVal.addRowName( new Integer( k ).toString(), k - 1 );
122
123 ff.read( fmtdd, p.add( index ).add( amount ) );
124
125 if ( ( index.intValue() % 500 ) == 0 ) {
126 log.debug( Format.sprintf(
127 "loading %2.1f%% complete (%dth entry)... \n", p.add(
128 100.0 * ( ( index.intValue() ) / ( ( float ) dim ) ) )
129 .add( index ) ) );
130 }
131
132 int[] rowind = new int[amount.intValue()];
133 for ( int i = 0; i < amount.intValue(); i++ ) {
134
135 Format.fscanf( ff, fmtd, p.add( index ) );
136 int ind = index.intValue();
137
138 if ( ind > dim || ind < 1 ) {
139 throw new IllegalStateException( "Illegal value " + ind
140 + " found in index list for item " + k );
141 }
142 rowind[i] = ind;
143 }
144
145 for ( int i = 0; i < amount.intValue(); i++ ) {
146 Format.fscanf( ff, fmtg, p.add( eval ) );
147 returnVal.setQuick( k - 1, rowind[i] - 1, eval.doubleValue() );
148
149 }
150
151 }
152 ff.close();
153 return returnVal;
154 }
155
156 /***
157 * Read a sparse matrix that is expressed as an adjacency list in a tab-delimited file:
158 *
159 * <pre>
160 * item1 item2 weight
161 * item1 item5 weight
162 * </pre>
163 *
164 * <p>
165 * By definition the resulting matrix is square and symmetric.
166 * </p>
167 * <p>
168 * Note that the ordering of the items will be as they are encountered in the file.
169 *
170 * @param stream InputStream
171 * @param wantedRowNames Set
172 * @return NamedMatrix
173 * @throws IOException
174 */
175 public NamedMatrix read( InputStream stream, Set wantedRowNames )
176 throws IOException {
177
178 Set itemNames = new HashSet();
179 Map rows = new HashMap();
180
181 BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
182
183 String row;
184 int index = 0;
185 Map nameIndexMap = new HashMap();
186 while ( ( row = dis.readLine() ) != null ) {
187 StringTokenizer st = new StringTokenizer( row, " \t", false );
188
189 String itemA = "";
190
191 if ( st.hasMoreTokens() ) {
192 itemA = st.nextToken();
193
194 if ( !itemNames.contains( itemA ) ) {
195 rows.put( itemA, new HashSet() );
196 itemNames.add( itemA );
197 nameIndexMap.put( itemA, new Integer( index ) );
198 index++;
199 }
200 } else {
201
202 }
203
204 String itemB = "";
205 if ( st.hasMoreTokens() ) {
206 itemB = st.nextToken();
207 if ( !itemNames.contains( itemB ) ) {
208 rows.put( itemB, new HashSet() );
209 itemNames.add( itemB );
210 nameIndexMap.put( itemB, new Integer( index ) );
211 index++;
212 }
213 } else {
214
215 }
216
217 double weight;
218 if ( st.hasMoreTokens() ) {
219 weight = Double.parseDouble( st.nextToken() );
220 } else {
221 weight = 1.0;
222 }
223
224 ( ( Set ) rows.get( itemA ) ).add( new IndexScoreDyad(
225 ( ( Integer ) nameIndexMap.get( itemB ) ).intValue(), weight ) );
226 ( ( Set ) rows.get( itemB ) ).add( new IndexScoreDyad(
227 ( ( Integer ) nameIndexMap.get( itemA ) ).intValue(), weight ) );
228 }
229
230 SparseDoubleMatrix2DNamed matrix = new SparseDoubleMatrix2DNamed(
231 itemNames.size(), itemNames.size() );
232
233 Vector itemVec = new Vector( itemNames );
234 Collections.sort(itemVec);
235
236 matrix.setColumnNames( itemVec );
237 matrix.setRowNames( itemVec );
238 for ( Iterator iter = itemNames.iterator(); iter.hasNext(); ) {
239 String itemA = ( String ) iter.next();
240 int rowIndex = matrix.getRowIndexByName( itemA );
241 Set arow = ( Set ) rows.get( itemA );
242 for ( Iterator iterator = arow.iterator(); iterator.hasNext(); ) {
243 IndexScoreDyad element = ( IndexScoreDyad ) iterator.next();
244 int ind = element.getKey();
245 double weight = element.getValue();
246
247 matrix.setQuick( rowIndex, ind, weight );
248 matrix.setQuick( ind, rowIndex, weight );
249 }
250
251 }
252
253 dis.close();
254 return matrix;
255 }
256
257
258
259
260 public NamedMatrix readOneRow( BufferedReader dis ) throws IOException {
261
262
263 throw new UnsupportedOperationException();
264 }
265
266 }
267
268