Clover coverage report - baseCode - 0.2.5
Coverage timestamp: Tue Apr 12 2005 11:31:58 EDT
file stats: LOC: 531   Methods: 12
NCLOC: 314   Classes: 1
30 day Evaluation Version distributed via the Maven Jar Repository. Clover is not free. You have 30 days to evaluate it. Please visit http://www.thecortex.net/clover to obtain a licensed version of Clover
 
 Source file Conditionals Statements Methods TOTAL
GeneSetMapTools.java 0% 0% 0% 0%
coverage
 1   
 package baseCode.bio.geneset;
 2   
 
 3   
 import hep.aida.IHistogram1D;
 4   
 import hep.aida.ref.Histogram1D;
 5   
 
 6   
 import java.util.ArrayList;
 7   
 import java.util.Collection;
 8   
 import java.util.Collections;
 9   
 import java.util.HashMap;
 10   
 import java.util.HashSet;
 11   
 import java.util.Iterator;
 12   
 import java.util.LinkedHashMap;
 13   
 import java.util.List;
 14   
 import java.util.Map;
 15   
 import java.util.Set;
 16   
 import java.util.Vector;
 17   
 
 18   
 import baseCode.util.StatusViewer;
 19   
 
 20   
 /**
 21   
  * Methods to 'clean' a set of geneSets - to remove redundancies, for example.
 22   
  * 
 23   
  * @author Paul Pavlidis
 24   
  * @version $Id: GeneSetMapTools.java,v 1.5 2004/10/13 21:58:46 pavlidis Exp $
 25   
  */
 26   
 public class GeneSetMapTools {
 27   
 
 28   
    /**
 29   
     * @param ga
 30   
     * @param countEmpty if false, gene sets that have no members are not counted in the total.
 31   
     * @return The average size of the gene sets.
 32   
     */
 33  0
    public static double meanGeneSetSize( GeneAnnotations ga, boolean countEmpty ) {
 34  0
       double sum = 0.0;
 35  0
       int n = 0;
 36   
 
 37  0
       Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
 38   
 
 39  0
       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
 40   
             .hasNext(); ) {
 41  0
          String geneSet = ( String ) iter.next();
 42   
 
 43  0
          Collection element;
 44   
 
 45  0
          element = ( Collection ) geneSetToGeneMap.get( geneSet );
 46   
 
 47  0
          if ( !countEmpty && element.size() == 0 ) {
 48  0
             continue;
 49   
          }
 50   
 
 51  0
          sum += element.size();
 52  0
          n++;
 53   
       }
 54   
 
 55  0
       return sum / n;
 56   
 
 57   
    }
 58   
 
 59   
    /**
 60   
     * @param sum
 61   
     * @param ga
 62   
     * @param countEmpty if false ,genes that have no gene sets assigned to them are not counted in the total.
 63   
     * @return The average number of gene sets per gene (per probe actually). This is a measure of gene set overlap. If
 64   
     *         the value is 1, it means that each gene is (on average) in only one set. Large values indicate larger
 65   
     *         amounts of overelap between gene sets.
 66   
     */
 67  0
    public static double meanSetsPerGene( GeneAnnotations ga, boolean countEmpty ) {
 68  0
       double sum = 0.0;
 69  0
       int n = 0;
 70   
 
 71  0
       Map probeToSetMap = ga.getProbeToGeneSetMap();
 72   
 
 73  0
       for ( Iterator iter = probeToSetMap.keySet().iterator(); iter.hasNext(); ) {
 74  0
          String probe = ( String ) iter.next();
 75   
 
 76  0
          Collection element;
 77   
 
 78  0
          element = ( Collection ) probeToSetMap.get( probe );
 79   
 
 80  0
          if ( !countEmpty && element.size() == 0 ) {
 81  0
             continue;
 82   
          }
 83   
 
 84  0
          sum += element.size();
 85  0
          n++;
 86   
 
 87   
       }
 88   
 
 89  0
       return sum / n;
 90   
 
 91   
    }
 92   
 
 93  0
    public static IHistogram1D geneSetSizeDistribution( GeneAnnotations ga,
 94   
          int numBins, int minSize, int maxSize ) {
 95  0
       Histogram1D hist = new Histogram1D( "Distribution of gene set sizes",
 96   
             numBins, minSize, maxSize );
 97   
 
 98  0
       Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
 99   
 
 100  0
       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
 101   
             .hasNext(); ) {
 102  0
          String geneSet = ( String ) iter.next();
 103   
 
 104  0
          Collection element;
 105   
 
 106  0
          element = ( Collection ) geneSetToGeneMap.get( geneSet );
 107  0
          hist.fill( element.size() );
 108   
       }
 109   
 
 110  0
       return hist;
 111   
 
 112   
    }
 113   
 
 114   
    /**
 115   
     * @param ga
 116   
     * @param gon
 117   
     * @param messenger
 118   
     * @param aspect
 119   
     */
 120  0
    public static void removeAspect( GeneAnnotations ga, GONames gon,
 121   
          StatusViewer messenger, String aspect ) {
 122  0
       if ( !( aspect.equals( "molecular_function" )
 123   
             || aspect.equals( "biological_process" ) || aspect
 124   
             .equals( "cellular_component" ) ) ) {
 125  0
          throw new IllegalArgumentException( "Unknown aspect requested" );
 126   
       }
 127   
 
 128  0
       Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
 129   
 
 130  0
       Set removeUs = new HashSet();
 131  0
       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
 132   
             .hasNext(); ) {
 133  0
          String geneSet = ( String ) iter.next();
 134   
 
 135  0
          if ( gon.getAspectForId( geneSet ).equals( aspect ) ) {
 136  0
             removeUs.add( geneSet );
 137   
          }
 138   
 
 139   
       }
 140   
 
 141  0
       for ( Iterator iter = removeUs.iterator(); iter.hasNext(); ) {
 142  0
          String geneSet = ( String ) iter.next();
 143  0
          ga.removeClassFromMaps( geneSet );
 144   
       }
 145   
 
 146  0
       ga.resetSelectedSets();
 147  0
       ga.sortGeneSets();
 148   
 
 149  0
       if ( messenger != null ) {
 150  0
          messenger.setStatus( "There are now " + ga.numGeneSets()
 151   
                + " sets remaining after removing aspect " + aspect );
 152   
       }
 153   
 
 154   
    }
 155   
 
 156   
    /**
 157   
     * Remove gene sets that don't meet certain criteria.
 158   
     * 
 159   
     * @param ga
 160   
     * @param messenger
 161   
     * @param minClassSize
 162   
     * @param maxClassSize
 163   
     */
 164  0
    public static void removeBySize( GeneAnnotations ga, StatusViewer messenger,
 165   
          int minClassSize, int maxClassSize ) {
 166   
 
 167  0
       Map geneSetToGeneMap = ga.getGeneSetToGeneMap();
 168   
 
 169  0
       Set removeUs = new HashSet();
 170  0
       for ( Iterator iter = geneSetToGeneMap.keySet().iterator(); iter
 171   
             .hasNext(); ) {
 172  0
          String geneSet = ( String ) iter.next();
 173   
 
 174  0
          Set element;
 175  0
          element = ( Set ) geneSetToGeneMap.get( geneSet );
 176  0
          if ( element.size() < minClassSize || element.size() > maxClassSize ) {
 177  0
             removeUs.add( geneSet );
 178   
          }
 179   
 
 180   
       }
 181   
 
 182  0
       for ( Iterator iter = removeUs.iterator(); iter.hasNext(); ) {
 183  0
          String geneSet = ( String ) iter.next();
 184  0
          ga.removeClassFromMaps( geneSet );
 185   
       }
 186   
 
 187  0
       ga.resetSelectedSets();
 188  0
       ga.sortGeneSets();
 189   
 
 190  0
       if ( messenger != null ) {
 191  0
          messenger.setStatus( "There are now " + ga.numGeneSets()
 192   
                + " sets remaining after removing sets with excluded sizes." );
 193   
       }
 194   
 
 195   
    }
 196   
 
 197   
    /**
 198   
     * <p>
 199   
     * Remove classes which are too similar to some other class. In addition, the user can select a penalty for large
 200   
     * gene sets. Thus when two gene sets are found to be similar, the decision of which one to keep can be tuned based
 201   
     * on the size penalty. We find it useful to penalize large gene sets so we tend to keep smaller ones (but not too
 202   
     * small). Useful values of the penalty are above 1 (a value of 1 will result in the larger class always being
 203   
     * retained).
 204   
     * </p>
 205   
     * <p>
 206   
     * The amount of similarity to be tolerated is set by the parameter fractionSameThreshold, representing the fraction
 207   
     * of genes in the smaller class which are also found in the larger class. Thus, setting this threshold to be 0.0
 208   
     * means that no overlap is tolerated. Setting it to 1 means that classes will never be discarded.
 209   
     * </p>
 210   
     * 
 211   
     * @param fractionSameThreshold A value between 0 and 1, indicating how similar a class must be before it gets
 212   
     *        ditched.
 213   
     * @param ga
 214   
     * @param messenger For updating a log.
 215   
     * @param maxClassSize Large class considered. (that doesn't mean they are removed)
 216   
     * @param minClassSize Smallest class considered. (that doesn't mean they are removed)
 217   
     * @param bigClassPenalty A value greater or equal to one, indicating the cost of retaining a larger class in favor
 218   
     *        of a smaller one. The penalty is scaled with the difference in sizes of the two classes being considered,
 219   
     *        so very large classes are more heavily penalized.
 220   
     */
 221  0
    public static void ignoreSimilar( double fractionSameThreshold,
 222   
          GeneAnnotations ga, StatusViewer messenger, int maxClassSize,
 223   
          int minClassSize, double bigClassPenalty ) {
 224   
 
 225  0
       Map classesToSimilarMap = new LinkedHashMap();
 226  0
       Set seenit = new HashSet();
 227  0
       Set deleteUs = new HashSet();
 228   
 
 229  0
       if ( messenger != null ) {
 230  0
          messenger.setStatus( "...Highly (" + fractionSameThreshold * 100
 231   
                + "%)  similar classes are being removed..." + ga.numGeneSets()
 232   
                + " to start..." );
 233   
       }
 234   
 
 235   
       // iterate over all the classes, starting from the smallest one.
 236   
       //      List sortedList = ga.sortGeneSetsBySize();
 237  0
       List sortedList = new ArrayList( ga.getGeneSetToGeneMap().keySet() );
 238  0
       Collections.shuffle( sortedList );
 239   
 
 240   
       // OUTER - compare all classes to each other.
 241  0
       for ( Iterator iter = sortedList.iterator(); iter.hasNext(); ) {
 242  0
          String queryClassId = ( String ) iter.next();
 243  0
          Set queryClass = ( Set ) ga.getGeneSetToGeneMap().get( queryClassId );
 244   
 
 245  0
          int querySize = queryClass.size();
 246   
 
 247  0
          if ( seenit.contains( queryClassId ) || querySize > maxClassSize
 248   
                || querySize < minClassSize ) {
 249  0
             continue;
 250   
          }
 251   
 
 252  0
          seenit.add( queryClassId );
 253   
 
 254   
          // INNER
 255  0
          for ( Iterator iterb = sortedList.iterator(); iterb.hasNext(); ) {
 256  0
             String targetClassId = ( String ) iterb.next();
 257   
 
 258   
             /// skip self comparisons and also symmetric comparisons.
 259  0
             if ( seenit.contains( targetClassId )
 260   
                   || targetClassId.equals( queryClassId ) ) {
 261  0
                continue;
 262   
             }
 263   
 
 264  0
             Set targetClass = ( Set ) ga.getGeneSetToGeneMap().get(
 265   
                   targetClassId );
 266   
 
 267  0
             int targetSize = targetClass.size();
 268  0
             if ( targetSize < querySize || targetSize > maxClassSize
 269   
                   || targetSize < minClassSize ) {
 270  0
                continue;
 271   
             }
 272   
 
 273  0
             double sizeScore;
 274   
 
 275  0
             if ( areSimilarClasses( targetClass, queryClass,
 276   
                   fractionSameThreshold, bigClassPenalty ) ) {
 277   
 
 278  0
                sizeScore = ( ( double ) targetClass.size() / ( double ) queryClass
 279   
                      .size() )
 280   
                      / bigClassPenalty;
 281   
 
 282  0
                if ( sizeScore < 1.0 ) { // delete the larget class.
 283  0
                   deleteUs.add( targetClassId );
 284  0
                   seenit.add( targetClassId );
 285   
                } else {
 286  0
                   deleteUs.add( queryClassId );
 287  0
                   seenit.add( queryClassId );
 288  0
                   break; // query is no longer relevant, go to the next one.
 289   
                }
 290   
 
 291  0
                storeSimilarSets( classesToSimilarMap, queryClassId,
 292   
                      targetClassId );
 293   
             }
 294   
 
 295   
          } /* inner while */
 296   
       }
 297   
       /* end while ... */
 298   
 
 299   
       /* remove the ones we don't want to keep */
 300  0
       Iterator itrd = deleteUs.iterator();
 301  0
       while ( itrd.hasNext() ) {
 302  0
          String deleteMe = ( String ) itrd.next();
 303  0
          ga.removeClassFromMaps( deleteMe );
 304   
       }
 305   
 
 306  0
       ga.resetSelectedSets();
 307  0
       ga.sortGeneSets();
 308   
 
 309  0
       if ( messenger != null ) {
 310  0
          messenger.setStatus( "There are now " + ga.numGeneSets()
 311   
                + " classes represented on the chip (" + deleteUs.size()
 312   
                + " were ignored)" );
 313   
       }
 314   
    }
 315   
 
 316   
    /* ignoreSimilar */
 317   
 
 318   
    /**
 319   
     * @param classesToSimilarMap
 320   
     * @param queryClassId
 321   
     * @param targetClassId
 322   
     */
 323  0
    private static void storeSimilarSets( Map classesToSimilarMap,
 324   
          String queryClassId, String targetClassId ) {
 325  0
       if ( !classesToSimilarMap.containsKey( targetClassId ) ) {
 326  0
          classesToSimilarMap.put( targetClassId, new HashSet() );
 327   
       }
 328  0
       if ( !classesToSimilarMap.containsKey( queryClassId ) ) {
 329  0
          classesToSimilarMap.put( queryClassId, new HashSet() );
 330   
 
 331   
       }
 332  0
       ( ( HashSet ) classesToSimilarMap.get( queryClassId ) )
 333   
             .add( targetClassId );
 334  0
       ( ( HashSet ) classesToSimilarMap.get( targetClassId ) )
 335   
             .add( queryClassId );
 336   
    }
 337   
 
 338   
    /**
 339   
     * Helper function for ignoreSimilar.
 340   
     */
 341  0
    private static boolean areSimilarClasses( Set biggerClass, Set smallerClass,
 342   
          double fractionSameThreshold, double bigClassPenalty ) {
 343   
 
 344  0
       if ( biggerClass.size() < smallerClass.size() ) {
 345  0
          throw new IllegalArgumentException( "Invalid sizes" );
 346   
       }
 347   
 
 348   
       /*
 349   
        * Threshold of how many items from the smaller class must NOT be in the bigger class, before we consider the
 350   
        * classes different.
 351   
        */
 352  0
       int notInThresh = ( int ) Math.ceil( fractionSameThreshold
 353   
             * smallerClass.size() );
 354   
 
 355  0
       int notin = 0;
 356   
 
 357  0
       int overlap = 0;
 358  0
       for ( Iterator iter = smallerClass.iterator(); iter.hasNext(); ) {
 359   
 
 360  0
          String gene = ( String ) iter.next();
 361  0
          if ( !biggerClass.contains( gene ) ) {
 362  0
             notin++;
 363   
          } else {
 364  0
             overlap++;
 365   
          }
 366  0
          if ( notin > notInThresh ) {
 367   
             // return false;
 368   
          }
 369   
       }
 370   
 
 371  0
       if ( ( double ) overlap / ( double ) smallerClass.size() > fractionSameThreshold ) {
 372   
          //         System.err.println( "Small class of size " + smallerClass.size()
 373   
          //               + " too much contained (overlap = " + overlap
 374   
          //               + ") in large class of size " + biggerClass.size() );
 375  0
          return true;
 376   
       }
 377   
 
 378   
       /* return true is the count is high enough */
 379   
       //   return true;
 380  0
       return false;
 381   
 
 382   
    }
 383   
 
 384   
    /**
 385   
     * Identify classes which are absoluely identical to others. This isn't superfast, because it doesn't know which
 386   
     * classes are actually relevant in the data.
 387   
     */
 388  0
    public static void collapseGeneSets( GeneAnnotations geneData,
 389   
          StatusViewer messenger ) {
 390  0
       Map setToGeneMap = geneData.getGeneSetToGeneMap();
 391  0
       Map classesToRedundantMap = geneData.geneSetToRedundantMap();
 392  0
       LinkedHashMap seenClasses = new LinkedHashMap();
 393  0
       LinkedHashMap sigs = new LinkedHashMap();
 394   
 
 395  0
       HashMap seenit = new HashMap();
 396   
 
 397  0
       if ( messenger != null ) {
 398  0
          messenger
 399   
                .setStatus( "There are "
 400   
                      + geneData.numGeneSets()
 401   
                      + " classes represented on the chip (of any size). Redundant classes are being removed..." );
 402   
       }
 403   
 
 404   
       // sort each arraylist in for each go and create a string that is a signature for this class.
 405  0
       int ignored = 0;
 406  0
       for ( Iterator iter = setToGeneMap.keySet().iterator(); iter.hasNext(); ) {
 407  0
          String classId = ( String ) iter.next();
 408  0
          Set classMembers = ( Set ) setToGeneMap.get( classId );
 409   
 
 410  0
          if ( classMembers.contains( null ) ) {
 411  0
             classMembers.remove( null ); // FIXME why do we need to do this?
 412   
             //          throw new IllegalStateException(classId + " contains null.");
 413   
          }
 414   
 
 415   
          // @todo - hack : Skip classes that are huge. It's too slow
 416   
          // otherwise. This is a total heuristic. Note that this
 417   
          // doesn't mean the class won't get analyzed, it just
 418   
          // means we don't bother looking for redundancies. Big
 419   
          // classes are less likely to be identical to others,
 420   
          // anyway. In tests, the range shown below has no effect
 421   
          // on the results, but it _could_ matter.
 422  0
          if ( classMembers == null || classMembers.size() > 250
 423   
                || classMembers.size() < 2 ) {
 424  0
             continue;
 425   
          }
 426   
 
 427  0
          Vector cls = new Vector( classMembers );
 428   
 
 429  0
          if ( cls == null ) continue;
 430   
 
 431  0
          Collections.sort( cls );
 432  0
          String signature = "";
 433  0
          seenit.clear();
 434  0
          Iterator classit = cls.iterator();
 435  0
          while ( classit.hasNext() ) {
 436  0
             String probeid = ( String ) classit.next();
 437  0
             if ( !seenit.containsKey( probeid ) ) {
 438  0
                signature = signature + "__" + probeid;
 439  0
                seenit.put( probeid, new Boolean( true ) );
 440   
             }
 441   
          }
 442  0
          sigs.put( classId, signature );
 443   
       }
 444   
 
 445   
       // look at the signatures for repeats.
 446  0
       for ( Iterator iter = sigs.keySet().iterator(); iter.hasNext(); ) {
 447  0
          String classId = ( String ) iter.next();
 448  0
          String signature = ( String ) sigs.get( classId );
 449   
 
 450   
          // if the signature has already been seen, add it to the redundant
 451   
          // list, and remove this class from the classToProbeMap.
 452  0
          if ( seenClasses.containsKey( signature ) ) {
 453  0
             if ( !classesToRedundantMap.containsKey( seenClasses
 454   
                   .get( signature ) ) ) {
 455  0
                classesToRedundantMap.put( seenClasses.get( signature ),
 456   
                      new ArrayList() );
 457   
 
 458   
             }
 459  0
             ( ( ArrayList ) classesToRedundantMap.get( seenClasses
 460   
                   .get( signature ) ) ).add( classId );
 461  0
             ignored++;
 462  0
             geneData.removeClassFromMaps( classId );
 463   
             //        System.err.println(classId + " is the same as an existing class, " + seenClasses.get(signature));
 464   
          } else {
 465   
             // add string to hash
 466  0
             seenClasses.put( signature, classId );
 467   
          }
 468   
       }
 469   
 
 470  0
       geneData.resetSelectedSets();
 471  0
       geneData.sortGeneSets();
 472   
 
 473  0
       if ( messenger != null ) {
 474  0
          messenger.setStatus( "There are now " + geneData.numGeneSets()
 475   
                + " classes represented on the chip (" + ignored
 476   
                + " were removed)" );
 477   
       }
 478   
    }
 479   
 
 480   
    /**
 481   
     * @param classId
 482   
     * @param classesToRedundantMap
 483   
     * @return
 484   
     */
 485  0
    public static ArrayList getRedundancies( String classId,
 486   
          Map classesToRedundantMap ) {
 487  0
       if ( classesToRedundantMap != null
 488   
             && classesToRedundantMap.containsKey( classId ) ) {
 489  0
          return ( ArrayList ) classesToRedundantMap.get( classId );
 490   
       }
 491  0
       return null;
 492   
 
 493   
    }
 494   
 
 495   
    /**
 496   
     * @param classId
 497   
     * @param classesToSimilarMap
 498   
     * @return
 499   
     */
 500  0
    public static ArrayList getSimilarities( String classId,
 501   
          Map classesToSimilarMap ) {
 502  0
       if ( classesToSimilarMap != null
 503   
             && classesToSimilarMap.containsKey( classId ) ) {
 504  0
          return ( ArrayList ) classesToSimilarMap.get( classId );
 505   
       }
 506  0
       return null;
 507   
    }
 508   
 
 509   
    /**
 510   
     * @param classId
 511   
     * @param classesToRedundantMap
 512   
     * @return
 513   
     */
 514  0
    public String getRedundanciesString( String classId,
 515   
          Map classesToRedundantMap ) {
 516  0
       if ( classesToRedundantMap != null
 517   
             && classesToRedundantMap.containsKey( classId ) ) {
 518  0
          ArrayList redundant = ( ArrayList ) classesToRedundantMap
 519   
                .get( classId );
 520  0
          Iterator it = redundant.iterator();
 521  0
          String returnValue = "";
 522  0
          while ( it.hasNext() ) {
 523  0
             returnValue = returnValue + ", " + it.next();
 524   
          }
 525  0
          return returnValue;
 526   
       }
 527  0
       return "";
 528   
    }
 529   
 
 530   
 } // end of class
 531