/*
 * MiningMart Version 1.0
 * 
 * Copyright (C) 2006 Martin Scholz, Timm Euler, 
 *                    Daniel Hakenjos, Katharina Morik
 *
 * Contact: miningmart@ls8.cs.uni-dortmund.de
 *
 * A list of contributing developers (other than the copyright 
 * holders) can be found at
 * http://mmart.cs.uni-dortmund.de/downloads/download.html
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program, see the file MM_HOME/LICENSE; if not, write
 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301, USA.
 */
package edu.udo.cs.miningmart.schemamatching;

/**
 *
 * A simple matcher based on a comparison of all possible pairs of 
 * n-grams of two strings. The value of n is set in the constructor.
 * <p>
 * At every start index of a string a fragment of up to n characters is
 * taken; the fragments close to the end of the string are therefore
 * shorter than n. Two fragments match if they are equal ignoring case,
 * or if the shorter one is a non-empty prefix of the longer one. The
 * similarity of two strings is the number of matching fragment pairs
 * divided by the number of complete n-grams of the longer string,
 * limited to 1.
 * 
 * @author Timm Euler
 * @version $Id: NgramMatcher.java,v 1.4 2006/04/11 14:10:17 euler Exp $
 */
public class NgramMatcher extends StringBasedSchemaMatcher {

	/** Score of two fragments that are equal (ignoring case). */
	private static final double EXACT_MATCH = 1d;

	/** Score of two fragments where the shorter one is a prefix of the longer one. */
	private static final double PREFIX_MATCH = 0.75d;

	/** Score of two fragments that have nothing in common. */
	private static final double NO_MATCH = 0d;

	/** The length of an n-gram; always at least 1. */
	private final int n;
	
	/**
	 * Constructor. Allows to set the length of an n-gram.
	 * 
	 * @param n the length of an n-gram, must be at least 1
	 * @throws IllegalArgumentException if n is smaller than 1; such a value
	 *         cannot be used to cut a string into n-grams
	 */
	public NgramMatcher(int n) {
		if (n < 1) {
			throw new IllegalArgumentException("The n-gram length must be at least 1, but was " + n);
		}
		this.n = n;
	}
	
	/**
	 * Computes the similarity of the two given strings, ignoring case.
	 * 
	 * @param str1 the first string, may be null
	 * @param str2 the second string, may be null
	 * @return 0 if one of the strings is null; if neither string is longer than n,
	 *         1 for equal strings, 0.75 if one string is a non-empty proper prefix of
	 *         the other, and 0 otherwise; in all other cases the fraction of matching
	 *         fragment pairs as described for this class, a value between 0 and 1
	 * @throws SchemaMatchException never thrown by this implementation
	 * @see edu.udo.cs.miningmart.schemamatching.StringBasedSchemaMatcher#getStringSimilarity(String, String)
	 */
	public double getStringSimilarity(String str1, String str2) throws SchemaMatchException {
		if (str1 == null || str2 == null) {
			return NO_MATCH;
		}
		
		// if neither string is longer than n, compare them directly 
		// in the same way as n-grams are compared:
		if (str1.length() <= this.n && str2.length() <= this.n) {
			return this.getNgramSimilarity(str1, str2);
		}
		
		// find out how many n-grams can possibly match; at least one string is
		// longer than n here, so this number is never smaller than 2:
		int numberOfPossibleMatches = Math.max(str1.length(), str2.length()) - this.n + 1;
		
		// cut both strings into fragments once, rather than once per compared pair:
		String[] firstFragments = this.getLowerCaseFragments(str1);
		String[] secondFragments = this.getLowerCaseFragments(str2);
		
		// go through both Strings and compare all fragments:
		int numberOfActualMatches = 0;
		for (int i = 0; i < firstFragments.length; i++) {
			for (int j = 0; j < secondFragments.length; j++) {
				if (compareLowerCaseFragments(firstFragments[i], secondFragments[j]) > NO_MATCH) {
					numberOfActualMatches++;
					// let's not count doubles: the result cannot exceed 1, so stop
					// as soon as every possible match is accounted for
					if (numberOfActualMatches >= numberOfPossibleMatches) {
						return 1d;
					}
				}
			}
		}
		
		return ((double) numberOfActualMatches) / ((double) numberOfPossibleMatches);
	}	

	// Returns, for every start index of the given string, the lower case version of the
	// fragment of up to n characters that starts there. Each fragment is converted to 
	// lower case separately, after it has been cut out of the string.
	private String[] getLowerCaseFragments(String str) {
		String[] fragments = new String[str.length()];
		for (int start = 0; start < fragments.length; start++) {
			// written this way so that a large n cannot overflow the end index
			int end = start + Math.min(this.n, str.length() - start);
			fragments[start] = toLowerCase(str.substring(start, end));
		}
		return fragments;
	}

	// Converts to lower case with a fixed locale, so that the similarity values do
	// not depend on the default locale of the virtual machine.
	private static String toLowerCase(String str) {
		return str.toLowerCase(java.util.Locale.ENGLISH);
	}

	// This method is used to compare the "atoms", ie the n-grams (Strings of length n).
	// The comparison ignores case.
	private double getNgramSimilarity(String ngram1, String ngram2) {
		return compareLowerCaseFragments(toLowerCase(ngram1), toLowerCase(ngram2));
	}

	// Compares two fragments that are already in lower case: equal fragments score 1,
	// a non-empty fragment that is a proper prefix of the other one scores 0.75, and
	// everything else scores 0.
	private static double compareLowerCaseFragments(String n1, String n2) {
		if (n1.length() == n2.length()) {
			return (n1.equals(n2) ? EXACT_MATCH : NO_MATCH);
		}
		String shorter = (n1.length() < n2.length() ? n1 : n2);
		String longer = (n1.length() < n2.length() ? n2 : n1);
		// every string starts with the empty string, so an empty fragment must
		// not be counted as a prefix match
		if (shorter.length() > 0 && longer.startsWith(shorter)) {
			return PREFIX_MATCH;
		}
		return NO_MATCH;
	}
}
/*
 * $Log: NgramMatcher.java,v $
 * Revision 1.4  2006/04/11 14:10:17  euler
 * Updated license text.
 *
 * Revision 1.3  2006/04/06 16:31:16  euler
 * Prepended license remark.
 *
 * Revision 1.2  2006/01/03 16:19:09  euler
 * Bugfixes
 *
 * Revision 1.1  2006/01/03 14:47:52  euler
 * New NgramMatcher, additional comments.
 *
 */
