/*
 * MiningMart Version 1.1
 * 
 * Copyright (C) 2006 Martin Scholz, Timm Euler, 
 *                    Daniel Hakenjos, Katharina Morik
 *
 * Contact: miningmart@ls8.cs.uni-dortmund.de
 *
 * A list of contributing developers (other than the copyright 
 * holders) can be found at
 * http://mmart.cs.uni-dortmund.de/downloads/download.html
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program, see the file MM_HOME/LICENSE; if not, write
 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301, USA.
 */
package edu.udo.cs.miningmart.schemamatching;

import java.util.Collection;
import java.util.Locale;
import java.util.Vector;

import edu.udo.cs.miningmart.m4.Value;

/**
 *
 * A simple matcher based on a comparison of all possible pairs of 
 * n-grams of two strings. The value of n is set in the constructor.
 * 
 * @author Timm Euler
 * @version $Id: NgramMatcher.java,v 1.5 2006/09/27 15:00:03 euler Exp $
 */
public class NgramMatcher extends NameAndTypeBasedSchemaMatcher {

	/**
	 * Similarity assigned to two atoms of different length where the
	 * shorter one is a (non-empty) prefix of the longer one.
	 */
	private static final double PREFIX_SIMILARITY = 0.75d;

	/** The length of an n-gram; guaranteed by the constructor to be at least 1. */
	private final int n;
	
	/**
	 * Constructor. Allows to set the length of an n-gram.
	 * 
	 * @param n the length of an n-gram; must be at least 1
	 * @throws IllegalArgumentException if <code>n</code> is smaller than 1
	 */
	public NgramMatcher(int n) {
		// n == 0 would turn every n-gram into the empty string, and a negative
		// n would make the substring calls below fail, so reject both early.
		if (n < 1) {
			throw new IllegalArgumentException(
					"The length of an n-gram must be at least 1, but was " + n);
		}
		this.n = n;
	}
	
	/**
	 * Computes the similarity of two names from their n-grams.
	 * <p>
	 * If either name is <code>null</code>, or if exactly one of them is 
	 * empty, the similarity is 0. If neither name is longer than n, the 
	 * two names are compared directly with 
	 * {@link #getStringSimilarity(String, String)}. Otherwise all n-grams 
	 * of both names are collected and handed to the inherited matrix-based 
	 * matching methods.
	 * 
	 * @param str1 the first name, may be <code>null</code>
	 * @param str2 the second name, may be <code>null</code>
	 * @return the similarity of the two names
	 * @throws SchemaMatchException declared by this method's signature; 
	 *         not thrown directly by the code in this class
	 * @see edu.udo.cs.miningmart.schemamatching.NameAndTypeBasedSchemaMatcher#getNameSimilarity(String, String)
	 */
	public double getNameSimilarity(String str1, String str2) throws SchemaMatchException {
		if (str1 == null || str2 == null) {
			return 0d;
		}
		
		// An empty name has no n-grams at all, so it cannot be similar to
		// a non-empty name. (Two empty names fall through to the direct
		// comparison below and are considered equal.)
		boolean firstIsEmpty = (str1.length() == 0);
		boolean secondIsEmpty = (str2.length() == 0);
		if (firstIsEmpty != secondIsEmpty) {
			return 0d;
		}
		
		// if neither string is longer than n, compare them directly 
		// in the same way as n-grams are compared:
		if (str1.length() <= this.n && str2.length() <= this.n) {
			return this.getStringSimilarity(str1, str2);
		}
		
		// otherwise use the matrix-based method on the n-grams of both strings:
		Collection<Value> firstNGrams = this.collectNGrams(str1);
		Collection<Value> secondNGrams = this.collectNGrams(str2);
		
		MatchingResult[][] matrix = this.getSimilarityMatrix(firstNGrams, secondNGrams);
		Collection<MatchingResult<Value>> maps = this.getSimilarMatchingsGreedy(matrix, true);
		return this.getGlobalSimilarity(firstNGrams, secondNGrams, maps);
	}	

	/**
	 * Collects the n-grams of the given string, one per start position, 
	 * each wrapped into a Value object. Windows that would extend beyond
	 * the end of the string are truncated there, so the n-grams starting 
	 * near the end are shorter than n.
	 * 
	 * @param str the string to split; must not be <code>null</code>
	 * @return the n-grams of <code>str</code> in order of their start position
	 */
	private Collection<Value> collectNGrams(String str) {
		Collection<Value> nGrams = new Vector<Value>();
		for (int start = 0; start < str.length(); start++) {
			int end = Math.min(start + this.n, str.length());
			// bit of a hack - that constructor has been added to
			// use value objects outside the M4 interface...:
			Value v = new edu.udo.cs.miningmart.m4.core.Value();
			v.setValue(str.substring(start, end));
			nGrams.add(v);
		}
		return nGrams;
	}

	/**
	 * This method is used to compare the "atoms", ie the n-grams 
	 * (Strings of length at most n); see 
	 * MmSchemaMatcher.getSimilarityMatrix(Collection, Collection).
	 * <p>
	 * The comparison ignores case. Two equal atoms have similarity 1.
	 * If the atoms differ in length and the shorter one is a non-empty 
	 * prefix of the longer one, the similarity is 0.75. In all other 
	 * cases, including <code>null</code> arguments, it is 0.
	 * 
	 * @param ngram1 the first atom, may be <code>null</code>
	 * @param ngram2 the second atom, may be <code>null</code>
	 * @return 1, 0.75 or 0 as described above
	 */
	public double getStringSimilarity(String ngram1, String ngram2) {
		if (ngram1 == null || ngram2 == null) {
			return 0d;
		}
		
		// Use a fixed locale so that the result does not depend on the
		// default locale of the JVM (e.g. the Turkish dotless i).
		String n1 = ngram1.toLowerCase(Locale.ENGLISH);
		String n2 = ngram2.toLowerCase(Locale.ENGLISH);
		
		if (n1.equals(n2)) {
			return 1d;
		}
		
		boolean firstIsShorter = n1.length() < n2.length();
		String shorter = firstIsShorter ? n1 : n2;
		String longer = firstIsShorter ? n2 : n1;
		
		// The empty string is a prefix of every string, which must not
		// count as a partial match.
		if (shorter.length() > 0 && longer.startsWith(shorter)) {
			return PREFIX_SIMILARITY;
		}
		return 0d;
	}
}
/*
 * $Log: NgramMatcher.java,v $
 * Revision 1.5  2006/09/27 15:00:03  euler
 * New version 1.1
 *
 * Revision 1.4  2006/04/11 14:10:17  euler
 * Updated license text.
 *
 * Revision 1.3  2006/04/06 16:31:16  euler
 * Prepended license remark.
 *
 * Revision 1.2  2006/01/03 16:19:09  euler
 * Bugfixes
 *
 * Revision 1.1  2006/01/03 14:47:52  euler
 * New NgramMatcher, additional comments.
 *
 */
