|
|||||||||||||||||||
| 30 day Evaluation Version distributed via the Maven Jar Repository. Clover is not free. You have 30 days to evaluate it. Please visit http://www.thecortex.net/clover to obtain a licensed version of Clover | |||||||||||||||||||
| Source file | Conditionals | Statements | Methods | TOTAL | |||||||||||||||
| SparseRaggedDouble2DNamedMatrixReader.java | 65.8% | 80.8% | 37.5% | 75.3% |
|
||||||||||||||
| 1 |
package baseCode.io.reader;
|
|
| 2 |
|
|
| 3 |
import java.io.BufferedReader;
|
|
| 4 |
import java.io.FileInputStream;
|
|
| 5 |
import java.io.IOException;
|
|
| 6 |
import java.io.InputStream;
|
|
| 7 |
import java.io.InputStreamReader;
|
|
| 8 |
import java.util.HashMap;
|
|
| 9 |
import java.util.HashSet;
|
|
| 10 |
import java.util.Map;
|
|
| 11 |
import java.util.Set;
|
|
| 12 |
import java.util.StringTokenizer;
|
|
| 13 |
|
|
| 14 |
import baseCode.dataStructure.matrix.NamedMatrix;
|
|
| 15 |
import baseCode.dataStructure.matrix.RCDoubleMatrix1D;
|
|
| 16 |
import baseCode.dataStructure.matrix.SparseRaggedDoubleMatrix2DNamed;
|
|
| 17 |
import baseCode.util.FileTools;
|
|
| 18 |
import cern.colt.list.DoubleArrayList;
|
|
| 19 |
import cern.colt.list.IntArrayList;
|
|
| 20 |
import cern.colt.map.OpenIntDoubleHashMap;
|
|
| 21 |
import cern.colt.map.OpenIntIntHashMap;
|
|
| 22 |
import cern.colt.map.OpenIntObjectHashMap;
|
|
| 23 |
import cern.colt.matrix.DoubleMatrix1D;
|
|
| 24 |
|
|
| 25 |
/**
|
|
| 26 |
* Best data structure for reading really big, really sparse matrices when a matrix representation is needed.
|
|
| 27 |
* <p>
|
|
| 28 |
* The standard format looks like this:
|
|
| 29 |
*
|
|
| 30 |
* <pre>
|
|
| 31 |
*
|
|
| 32 |
*
|
|
| 33 |
*
|
|
| 34 |
*
|
|
| 35 |
*
|
|
| 36 |
*
|
|
| 37 |
*
|
|
| 38 |
*
|
|
| 39 |
*
|
|
| 40 |
*
|
|
| 41 |
*
|
|
| 42 |
*
|
|
| 43 |
*
|
|
| 44 |
*
|
|
| 45 |
*
|
|
| 46 |
*
|
|
| 47 |
* 2 <--- number of items - the first line of the file only. NOTE - this line is often blank or not present.
|
|
| 48 |
* 1 2 <--- item 1 has 2 edges
|
|
| 49 |
* 1 2 <--- edge indices are to items 1 & 2
|
|
| 50 |
* 0.1 100 <--- with the following weights
|
|
| 51 |
* 2 2 <--- item 2 also has 2 edges
|
|
| 52 |
* 1 2 <--- edge indices are also to items 1 & 2 (fully connected)
|
|
| 53 |
* 100 0.1 <--- with the following weights
|
|
| 54 |
*
|
|
| 55 |
*
|
|
| 56 |
*
|
|
| 57 |
*
|
|
| 58 |
*
|
|
| 59 |
*
|
|
| 60 |
*
|
|
| 61 |
*
|
|
| 62 |
*
|
|
| 63 |
*
|
|
| 64 |
*
|
|
| 65 |
*
|
|
| 66 |
*
|
|
| 67 |
*
|
|
| 68 |
*
|
|
| 69 |
* </pre>
|
|
| 70 |
*
|
|
| 71 |
* <hr>
|
|
| 72 |
* <p>
|
|
| 73 |
* Copyright (c) 2004 Columbia University
|
|
| 74 |
*
|
|
| 75 |
* @author pavlidis
|
|
| 76 |
* @version $Id: SparseRaggedDouble2DNamedMatrixReader.java,v 1.22 2005/01/05 02:01:02 pavlidis Exp $
|
|
| 77 |
*/
|
|
| 78 |
public class SparseRaggedDouble2DNamedMatrixReader extends |
|
| 79 |
AbstractNamedMatrixReader {
|
|
| 80 |
|
|
| 81 |
/**
|
|
| 82 |
* Read a sparse symmetric square matrix that is expressed as an adjacency list in a tab-delimited file:
|
|
| 83 |
*
|
|
| 84 |
* <pre>
|
|
| 85 |
*
|
|
| 86 |
*
|
|
| 87 |
*
|
|
| 88 |
*
|
|
| 89 |
*
|
|
| 90 |
*
|
|
| 91 |
*
|
|
| 92 |
*
|
|
| 93 |
*
|
|
| 94 |
*
|
|
| 95 |
*
|
|
| 96 |
*
|
|
| 97 |
* item1 item2 weight
|
|
| 98 |
* item1 item5 weight
|
|
| 99 |
*
|
|
| 100 |
*
|
|
| 101 |
*
|
|
| 102 |
*
|
|
| 103 |
*
|
|
| 104 |
*
|
|
| 105 |
*
|
|
| 106 |
*
|
|
| 107 |
*
|
|
| 108 |
*
|
|
| 109 |
*
|
|
| 110 |
*
|
|
| 111 |
* </pre>
|
|
| 112 |
*
|
|
| 113 |
* <p>
|
|
| 114 |
* IMPORTANT: By definition the resulting matrix is square and symmetric, even if the symmetric edges are not
|
|
| 115 |
* explicitly listed.
|
|
| 116 |
*
|
|
| 117 |
* @param name of file
|
|
| 118 |
* @return
|
|
| 119 |
*/
|
|
| 120 | 0 |
public NamedMatrix readFromAdjList( String fileName ) throws IOException { |
| 121 | 0 |
if ( !FileTools.testFile( fileName ) ) {
|
| 122 | 0 |
throw new IOException( "Could not read from file " + fileName ); |
| 123 |
} |
|
| 124 | 0 |
FileInputStream stream = new FileInputStream( fileName );
|
| 125 | 0 |
return readFromAdjList( stream );
|
| 126 |
} |
|
| 127 |
|
|
| 128 |
/**
|
|
| 129 |
* @throws IOException
|
|
| 130 |
* @throws NumberFormatException Read a sparse symmetric square matrix that is expressed as an adjacency list in a
|
|
| 131 |
* tab-delimited file:
|
|
| 132 |
*
|
|
| 133 |
* <pre>
|
|
| 134 |
*
|
|
| 135 |
*
|
|
| 136 |
*
|
|
| 137 |
*
|
|
| 138 |
*
|
|
| 139 |
*
|
|
| 140 |
*
|
|
| 141 |
*
|
|
| 142 |
*
|
|
| 143 |
*
|
|
| 144 |
*
|
|
| 145 |
*
|
|
| 146 |
* item1 item2 weight
|
|
| 147 |
* item1 item5 weight
|
|
| 148 |
*
|
|
| 149 |
*
|
|
| 150 |
*
|
|
| 151 |
*
|
|
| 152 |
*
|
|
| 153 |
*
|
|
| 154 |
*
|
|
| 155 |
*
|
|
| 156 |
*
|
|
| 157 |
*
|
|
| 158 |
*
|
|
| 159 |
*
|
|
| 160 |
* </pre>
|
|
| 161 |
*
|
|
| 162 |
* <p>
|
|
| 163 |
* IMPORTANT: By definition the resulting matrix is square and symmetric, even if the symmetric edges are not
|
|
| 164 |
* explicitly listed.
|
|
| 165 |
* @param stream
|
|
| 166 |
* @return
|
|
| 167 |
*/
|
|
| 168 | 2 |
public NamedMatrix readFromAdjList( InputStream stream )
|
| 169 |
throws NumberFormatException, IOException {
|
|
| 170 | 2 |
Set itemNames = new HashSet();
|
| 171 | 2 |
Map rows = new HashMap();
|
| 172 |
|
|
| 173 | 2 |
BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) ); |
| 174 |
|
|
| 175 | 2 |
OpenIntObjectHashMap indexNameMap = new OpenIntObjectHashMap(); // eventual row index --> name |
| 176 | 2 |
Map nameIndexMap = new HashMap(); // name --> eventual row index |
| 177 |
|
|
| 178 |
/*
|
|
| 179 |
* Store the information about the matrix in a temporary set of data structures, the most important of which is a
|
|
| 180 |
* map of nodes to edge information. Each edge information object contains the index and the weight of the edge.
|
|
| 181 |
*/
|
|
| 182 | 2 |
String row; |
| 183 | 2 |
int index = 0;
|
| 184 | ? |
while ( ( row = dis.readLine() ) != null ) { |
| 185 | 11742 |
StringTokenizer st = new StringTokenizer( row, " \t", false ); |
| 186 |
|
|
| 187 | 11742 |
String itemA = "";
|
| 188 | 11742 |
if ( st.hasMoreTokens() ) {
|
| 189 | 11742 |
itemA = st.nextToken(); |
| 190 | 11742 |
if ( !itemNames.contains( itemA ) ) {
|
| 191 | 490 |
rows.put( itemA, new OpenIntDoubleHashMap() );
|
| 192 | 490 |
itemNames.add( itemA ); |
| 193 | 490 |
indexNameMap.put( index, itemA ); |
| 194 | 490 |
nameIndexMap.put( itemA, new Integer( index ) );
|
| 195 | 490 |
( ( OpenIntDoubleHashMap ) rows.get( itemA ) ).put( index, 0 ); // to itself. - in case it isn't there.
|
| 196 | 490 |
index++; |
| 197 |
} |
|
| 198 |
} else
|
|
| 199 | 0 |
continue;
|
| 200 |
|
|
| 201 | 11742 |
String itemB = "";
|
| 202 | 11742 |
if ( st.hasMoreTokens() ) {
|
| 203 | 11742 |
itemB = st.nextToken(); |
| 204 | 11742 |
if ( !itemNames.contains( itemB ) ) {
|
| 205 | 1271 |
rows.put( itemB, new OpenIntDoubleHashMap() );
|
| 206 | 1271 |
itemNames.add( itemB ); |
| 207 | 1271 |
indexNameMap.put( index, itemB ); |
| 208 | 1271 |
nameIndexMap.put( itemB, new Integer( index ) );
|
| 209 | 1271 |
( ( OpenIntDoubleHashMap ) rows.get( itemB ) ).put( index, 0 ); // to itself. - in case it isn't there.
|
| 210 | 1271 |
index++; |
| 211 |
} |
|
| 212 |
} else
|
|
| 213 | 0 |
continue;
|
| 214 |
|
|
| 215 | 11742 |
double weight;
|
| 216 | 11742 |
if ( st.hasMoreTokens() ) {
|
| 217 | 11742 |
weight = Double.parseDouble( st.nextToken() ); |
| 218 |
} else {
|
|
| 219 | 0 |
weight = 1.0; // just make it a binary matrix.
|
| 220 |
} |
|
| 221 |
|
|
| 222 | 11742 |
int aind = ( ( Integer ) nameIndexMap.get( itemA ) ).intValue();
|
| 223 | 11742 |
int bind = ( ( Integer ) nameIndexMap.get( itemB ) ).intValue();
|
| 224 |
|
|
| 225 |
// if (itemA.equals("CYP4A11") || itemB.equals("CYP4A11"))
|
|
| 226 |
// System.err.println( itemA + " " + itemB + " " + aind + " " + bind );
|
|
| 227 |
|
|
| 228 | 11742 |
( ( OpenIntDoubleHashMap ) rows.get( itemA ) ).put( bind, weight ); // link a to b.
|
| 229 | 11742 |
( ( OpenIntDoubleHashMap ) rows.get( itemB ) ).put( aind, weight ); // link b to a.
|
| 230 |
|
|
| 231 | 11742 |
if ( ( rows.size() % 500 ) == 0 ) {
|
| 232 | 3 |
log.info( new String( "loading " + index + "th pair" ) ); |
| 233 |
} |
|
| 234 |
} |
|
| 235 | 2 |
dis.close(); |
| 236 |
|
|
| 237 | 2 |
SparseRaggedDoubleMatrix2DNamed matrix = new SparseRaggedDoubleMatrix2DNamed();
|
| 238 |
|
|
| 239 | 2 |
for ( int i = 0; i < indexNameMap.size(); i++ ) { |
| 240 | 1761 |
String itemName = ( String ) indexNameMap.get( i ); |
| 241 |
|
|
| 242 | 1761 |
OpenIntDoubleHashMap arow = ( OpenIntDoubleHashMap ) rows |
| 243 |
.get( itemName ); |
|
| 244 |
|
|
| 245 | 1761 |
DoubleArrayList finalValues = new DoubleArrayList( arow.size() );
|
| 246 |
|
|
| 247 |
// System.err.println( itemName + " has " + arow.size() + " links" );
|
|
| 248 | 1761 |
IntArrayList inB = arow.keys(); |
| 249 | 1761 |
inB.sort(); |
| 250 | 1761 |
int[] rowMemberIndexes = inB.elements();
|
| 251 |
// System.err.println( itemName + " " + i + " " + inB );
|
|
| 252 |
|
|
| 253 | 1761 |
for ( int j = 0; j < rowMemberIndexes.length; j++ ) { |
| 254 | 25237 |
int itemNumber = rowMemberIndexes[j]; // keys |
| 255 | 25237 |
double weight = arow.get( itemNumber );
|
| 256 | 25237 |
finalValues.add( weight ); |
| 257 |
} |
|
| 258 |
|
|
| 259 | 1761 |
DoubleMatrix1D rowMatrix = new RCDoubleMatrix1D( inB, finalValues );
|
| 260 | 1761 |
matrix.addRow( itemName, rowMatrix ); |
| 261 |
|
|
| 262 | 1761 |
if ( i > 0 && ( i % 500 ) == 0 ) {
|
| 263 | 3 |
log.info( new String( "Adding " + i + "th row" ) ); |
| 264 |
} |
|
| 265 |
} |
|
| 266 | 2 |
return matrix;
|
| 267 |
} |
|
| 268 |
|
|
| 269 |
/*
|
|
| 270 |
* (non-Javadoc)
|
|
| 271 |
*
|
|
| 272 |
* @see baseCode.io.reader.AbstractNamedMatrixReader#read(java.lang.String)
|
|
| 273 |
*/
|
|
| 274 | 0 |
public NamedMatrix read( String fileName ) throws IOException { |
| 275 | 0 |
if ( !FileTools.testFile( fileName ) ) {
|
| 276 | 0 |
throw new IOException( "Could not read from file " + fileName ); |
| 277 |
} |
|
| 278 | 0 |
FileInputStream stream = new FileInputStream( fileName );
|
| 279 | 0 |
return read( stream );
|
| 280 |
} |
|
| 281 |
|
|
| 282 | 0 |
public NamedMatrix readOneRow( BufferedReader dis ) throws IOException { |
| 283 | 0 |
return this.readOneRow( dis, 0 ); |
| 284 |
} |
|
| 285 |
|
|
| 286 |
/**
|
|
| 287 |
* Use this to read one row from a matrix (JW format). It does not close the reader. (this actually has to read
|
|
| 288 |
* several lines to get the data for one matrix row)
|
|
| 289 |
*
|
|
| 290 |
* @param stream
|
|
| 291 |
* @param offset A value indicating the lowest value for the indexes listed. This is here in case the indexes in the
|
|
| 292 |
* stream are numbered starting from 1 instead of zero.
|
|
| 293 |
* @return @throws IOException
|
|
| 294 |
*/
|
|
| 295 | 0 |
public NamedMatrix readOneRow( BufferedReader dis, int offset ) |
| 296 |
throws IOException {
|
|
| 297 | 0 |
SparseRaggedDoubleMatrix2DNamed returnVal = new SparseRaggedDoubleMatrix2DNamed();
|
| 298 |
|
|
| 299 | 0 |
String row = dis.readLine(); // line containing the id and the number of edges.
|
| 300 | 0 |
StringTokenizer tok = new StringTokenizer( row, " \t" ); |
| 301 |
|
|
| 302 | 0 |
int index = Integer.parseInt( tok.nextToken() );
|
| 303 | 0 |
int amount = Integer.parseInt( tok.nextToken() );
|
| 304 | 0 |
String rowName = new Integer( index ).toString();
|
| 305 | 0 |
returnVal.addRow( rowName, readOneRow( dis, amount, offset ) ); |
| 306 | 0 |
return returnVal;
|
| 307 |
} |
|
| 308 |
|
|
| 309 |
/**
|
|
| 310 |
* Read an entire sparse matrix from a stream (JW format).
|
|
| 311 |
*
|
|
| 312 |
* @param stream
|
|
| 313 |
* @return @throws IOException
|
|
| 314 |
*/
|
|
| 315 | 0 |
public NamedMatrix read( InputStream stream ) throws IOException { |
| 316 | 0 |
return this.read( stream, 0 ); |
| 317 |
} |
|
| 318 |
|
|
| 319 |
/**
|
|
| 320 |
* Read an entire sparse matrix from a stream (JW format).
|
|
| 321 |
*
|
|
| 322 |
* @param stream
|
|
| 323 |
* @param offset A value indicating the lowest value for the indexes listed. This is here in case the indexes in the
|
|
| 324 |
* stream are numbered starting from 1 instead of zero.
|
|
| 325 |
* @return @throws IOException
|
|
| 326 |
*/
|
|
| 327 | 6 |
public NamedMatrix read( InputStream stream, int offset ) throws IOException { |
| 328 | 6 |
BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) ); |
| 329 | 6 |
SparseRaggedDoubleMatrix2DNamed returnVal = new SparseRaggedDoubleMatrix2DNamed();
|
| 330 |
|
|
| 331 | 6 |
String row; |
| 332 | 6 |
int k = 1;
|
| 333 |
|
|
| 334 | ? |
while ( ( row = dis.readLine() ) != null ) { |
| 335 |
|
|
| 336 | 24 |
if ( row.equals( "" ) ) { // in case there is a blank line at the top. |
| 337 | 0 |
continue;
|
| 338 |
} |
|
| 339 |
|
|
| 340 | 24 |
StringTokenizer tok = new StringTokenizer( row, " \t" ); |
| 341 | 24 |
if ( tok.countTokens() != 2 ) { // in case the row count is there. |
| 342 | 6 |
continue;
|
| 343 |
} |
|
| 344 |
|
|
| 345 | 18 |
int index = Integer.parseInt( tok.nextToken() ) - offset;
|
| 346 | 18 |
int amount = Integer.parseInt( tok.nextToken() );
|
| 347 |
|
|
| 348 | 18 |
if ( ( index % 500 ) == 0 ) {
|
| 349 | 6 |
log.info( new String( "loading " + index + "th entry" ) ); |
| 350 |
} |
|
| 351 |
|
|
| 352 | 18 |
returnVal.addRow( new Integer( k ).toString(), readOneRow( dis,
|
| 353 |
amount, offset ) ); |
|
| 354 |
|
|
| 355 | 18 |
k++; |
| 356 |
} |
|
| 357 |
|
|
| 358 | 6 |
dis.close(); |
| 359 | 6 |
return returnVal;
|
| 360 |
} |
|
| 361 |
|
|
| 362 | 18 |
private DoubleMatrix1D readOneRow( BufferedReader dis, int amount, int offset ) |
| 363 |
throws IOException {
|
|
| 364 |
|
|
| 365 |
/*
|
|
| 366 |
* we have to be careful to skip any lines that invalid. Each line should have at least two characters. In the
|
|
| 367 |
* files JW provided there are some lines that are just " ".
|
|
| 368 |
*/
|
|
| 369 | 18 |
String rowInd = "";
|
| 370 | 18 |
String rowWei = "";
|
| 371 |
|
|
| 372 |
// while ( rowInd.length() < 2 ) {
|
|
| 373 | 18 |
rowInd = dis.readLine(); // row with indices.
|
| 374 |
// }
|
|
| 375 |
|
|
| 376 |
// while ( rowWei.length() < 2 ) {
|
|
| 377 | 18 |
rowWei = dis.readLine(); // row with weights.
|
| 378 |
// }
|
|
| 379 |
|
|
| 380 | 18 |
StringTokenizer tokw = new StringTokenizer( rowWei, " \t" ); |
| 381 | 18 |
StringTokenizer toki = new StringTokenizer( rowInd, " \t" ); |
| 382 |
|
|
| 383 | 18 |
OpenIntIntHashMap map = new OpenIntIntHashMap( amount, 0.4, 0.8 );
|
| 384 | 18 |
DoubleArrayList values = new DoubleArrayList( amount );
|
| 385 | 18 |
DoubleArrayList finalValues = new DoubleArrayList( amount );
|
| 386 |
|
|
| 387 | 18 |
int i = 0;
|
| 388 | 18 |
while ( toki.hasMoreTokens() ) {
|
| 389 |
|
|
| 390 | 36 |
double weight = Double.parseDouble( tokw.nextToken() );
|
| 391 | 36 |
int ind = Integer.parseInt( toki.nextToken() ) - offset;
|
| 392 |
|
|
| 393 | 36 |
if ( ind < 0 ) {
|
| 394 | 0 |
throw new IllegalStateException( |
| 395 |
"Can't have negative index - check offset." );
|
|
| 396 |
} |
|
| 397 |
|
|
| 398 | 36 |
map.put( ind, i ); |
| 399 | 36 |
values.add( weight ); |
| 400 | 36 |
i++; |
| 401 |
} |
|
| 402 |
|
|
| 403 | 18 |
IntArrayList indexes = map.keys(); |
| 404 | 18 |
indexes.sort(); |
| 405 | 18 |
int[] ix = indexes.elements();
|
| 406 | 18 |
int size = ix.length;
|
| 407 | 18 |
for ( int j = 0; j < size; j++ ) { |
| 408 | 36 |
finalValues.add( values.get( map.get( ix[j] ) ) ); |
| 409 |
} |
|
| 410 |
|
|
| 411 | 18 |
return new RCDoubleMatrix1D( indexes, finalValues ); |
| 412 |
} |
|
| 413 |
|
|
| 414 |
} |
|
||||||||||