Recently, I have been working on measuring the semantic distance between concepts in ontologies. One such tool is SimMetrics at
http://www.dcs.shef.ac.uk/~sam/stringmetrics.html which is an open source extensible library in Java. The following string metrics are included in the package and can be found here at
http://www.dcs.shef.ac.uk/~sam/stringmetrics.html with a corresponding description. These similarity measures can be found in statisics, DNA analysis, artificial intelligence, and information retrieval.
For example, the cosine similarity is a vector space measure:
/**
* SimMetrics - SimMetrics is a java library of Similarity or Distance
* Metrics, e.g. Levenshtein Distance, that provide float based similarity
* measures between String Data. All metrics return consistant measures
* rather than unbounded similarity scores.
*
* Copyright (C) 2005 Sam Chapman - Open Source Release v1.1
*
* Please Feel free to contact me about this library, I would appreciate
* knowing quickly what you wish to use it for and any criticisms/comments
* upon the SimMetric library.
*
* email: s.chapman@dcs.shef.ac.uk
* www: http://www.dcs.shef.ac.uk/~sam/
* www: http://www.dcs.shef.ac.uk/~sam/stringmetrics.html
*
* address: Sam Chapman,
* Department of Computer Science,
* University of Sheffield,
* Sheffield,
* S. Yorks,
* S1 4DP
* United Kingdom,
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package uk.ac.shef.wit.simmetrics.similaritymetrics;
import uk.ac.shef.wit.simmetrics.tokenisers.InterfaceTokeniser;
import uk.ac.shef.wit.simmetrics.tokenisers.TokeniserWhitespace;
import java.util.HashSet;
import java.util.Set;
import java.util.ArrayList;
import java.io.Serializable;
/**
* Package: uk.ac.shef.wit.simmetrics.similaritymetrics.cosinesimilarity
* Description: uk.ac.shef.wit.simmetrics.similaritymetrics.cosinesimilarity implements a
* Date: 05-Apr-2004
* Time: 10:52:58
* @author Sam Chapman
Website,
Email.
* @version 1.1
*/
public final class CosineSimilarity extends AbstractStringMetric implements Serializable {
/**
* a constant for calculating the estimated timing cost.
*/
private final float ESTIMATEDTIMINGCONST = 0.00000038337142857142857142857142857142f;
/**
* private tokeniser for tokenisation of the query strings.
*/
private final InterfaceTokeniser tokeniser;
/**
* constructor - default (empty).
*/
public CosineSimilarity() {
tokeniser = new TokeniserWhitespace();
}
/**
* constructor.
*
* @param tokeniserToUse - the tokeniser to use should a different tokeniser be required
*/
public CosineSimilarity(final InterfaceTokeniser tokeniserToUse) {
tokeniser = tokeniserToUse;
}
/**
* returns the string identifier for the metric.
*
* @return the string identifier for the metric
*/
public String getShortDescriptionString() {
return "CosineSimilarity";
}
/**
* returns the long string identifier for the metric.
*
* @return the long string identifier for the metric
*/
public String getLongDescriptionString() {
return "Implements the Cosine Similarity algorithm providing a similarity measure between two strings from the angular divergence within term based vector space";
}
/**
* gets a div class xhtml similarity explaining the operation of the metric.
*
* @param string1 string 1
* @param string2 string 2
*
* @return a div class html section detailing the metric operation.
*/
public String getSimilarityExplained(String string1, String string2) {
//todo this should explain the operation of a given comparison
return null; //To change body of implemented methods use File Settings File Templates.
}
/**
* gets the estimated time in milliseconds it takes to perform a similarity timing.
*
* @param string1 string 1
* @param string2 string 2
*
* @return the estimated time in milliseconds taken to perform the similarity measure
*/
public float getSimilarityTimingEstimated(final String string1, final String string2) {
//timed millisecond times with string lengths from 1 + 50 each increment
//0 0.02 0.03 0.05 0.08 0.11 0.14 0.18 0.23 0.27 0.33 0.39 0.46 0.52 0.6 0.65 0.76 0.84 0.94 1.01 1.13 1.23 1.49 1.45 1.95 1.67 2.26 1.93 2.6 2.26 2.86 2.54 3.17 2.91 3.76 3.17 3.9 3.5 4.32 3.9 5.1 4.32 5.64 4.83 5.64 5.07 6.34 5.64 7.03 5.97 7.81 6.55 8.12 7 9.23 7.52 9.71 8.12 10.68 8.46
final float str1Length = string1.length();
final float str2Length = string2.length();
return (str1Length + str2Length) * ((str1Length + str2Length) * ESTIMATEDTIMINGCONST);
}
/**
* gets the similarity of the two strings using CosineSimilarity.
*
* @param string1
* @param string2
* @return a value between 0-1 of the similarity
*/
public float getSimilarity(final String string1, final String string2) {
final ArrayList
str1Tokens = tokeniser.tokenizeToArrayList(string1);
final ArrayList str2Tokens = tokeniser.tokenizeToArrayList(string2);
final Set allTokens = new HashSet();
allTokens.addAll(str1Tokens);
final int termsInString1 = allTokens.size();
final Set secondStringTokens = new HashSet();
secondStringTokens.addAll(str2Tokens);
final int termsInString2 = secondStringTokens.size();
//now combine the sets
allTokens.addAll(secondStringTokens);
final int commonTerms = (termsInString1 + termsInString2) - allTokens.size();
//return CosineSimilarity
return (float) (commonTerms) / (float) (Math.pow((float) termsInString1, 0.5f) * Math.pow((float) termsInString2, 0.5f));
}
/**
* gets the un-normalised similarity measure of the metric for the given strings.
*
* @param string1
* @param string2
* @return returns the score of the similarity measure (un-normalised)
*/
public float getUnNormalisedSimilarity(String string1, String string2) {
return getSimilarity(string1, string2);
}
}
For some insight in using the toolkit, consider the paper by Cohen, et. al. from Carnegie Mellon University at http://www.isi.edu/info-agents/workshops/ijcai03/papers/Cohen-p.pdf provides an overview of similarity measures in matching entity names. In order to look at some the metrics in the paper that are not included in SimMetrics, I downloaded their jar of metrics at http://sourceforge.net/projects/secondstring/ and looked at the JavaDocs at http://secondstring.sourceforge.net/javadoc/. Thus, between these two jars, secondstring and SimMetrics, there are plenty of opportunities to build extensible libraries for developing and using semantic distance metrics.