package com.rapidminer.operator.validation;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeWeights;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.AttributeWeightedExampleSet;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ValueDouble;
import com.rapidminer.operator.ValueString;
import com.rapidminer.operator.condition.CombinedInnerOperatorCondition;
import com.rapidminer.operator.condition.InnerOperatorCondition;
import com.rapidminer.operator.condition.SpecificInnerOperatorCondition;
import com.rapidminer.operator.performance.PerformanceCriterion;
import com.rapidminer.operator.performance.PerformanceVector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.AverageVector;


/**
 * 
 * <p>
 * This operator evaluates the performance of a feature selection
 * algorithm. The evaluation is done in an unusual way: Normally, 
 * we would evaluate the performance on unseen examples of the  
 * classes used during feature selection. Here, we evaluate the performance 
 * on unseen examples belonging 
 * to previously unseen classes.</p>
 * <p> 
 * Consider the following example: We have log data of applications
 * consisting of system calls (features). We want to predict the name
 * of the application by looking at the log data. We also want to apply feature 
 * selection, possibly eliminating irrelevant system calls.
 * If the applications are fixed, we do the usual feature selection,
 * determining which features to use on a sample containing the same 
 * classes (applications) that are in the test set.
 * However, we might want to know which features (system calls) are generally
 * more relevant than others. This would allow us to make recommendations
 * which features to use for learning the names of arbitrary, previously
 * unseen applications. Obviously, that's a bit more difficult.
 * </p>
 * <p>
 * The first inner operator is the feature selection algorithm to be evaluated.
 * Its input is an example set containing examples belonging to a randomly
 * selected sample of classes. 
 * It must return an attribute weights vector which is applied to the test data.
 * </p>
 * <p>
 * The test data is an example set containing examples not belonging to the classes 
 * selected for the first inner operator. 
 * The second inner operator takes this test data and produces a performance vector.
 * </p>
 * <p>
 * There are two ways to evaluate this performance. If the checkbox "auto_validation"
 * is selected, the second operator is also run on a set of unseen examples 
 * of the known classes (as in plain attribute selection),
 * giving an estimate of the penalty incurred by selecting the attributes on the 
 * wrong set of classes. The set of unseen examples of the known classes is produced 
 * by splitting the set of examples of the known classes in two before selecting the attributes,
 * so attribute selection is done on a set half the original size when this checkbox is selected.
 * If the checkbox "all_attributes" is selected, the second operator is also run on the
 * set of unseen examples with all attributes switched on.
 * The second operator will thus be run up to three times for each subset of classes.
 * It is guaranteed that these three runs are performed on example sets of equal
 * size, so that none of the inner learning algorithms has an unfair advantage and
 * the results are comparable.</p><p>
 * The operator also provides extensive logging facilities.</p>
 *  
 * 
 * @author Peter Fricke, Ingo Mierswa (WrapperXValidation)
 */
public class UnseenClassValidation extends OperatorChain {

	/** The parameter name for &quot;Number of iterations for the validation&quot; */
	public static final String PARAMETER_NUMBER_OF_VALIDATIONS = "number_of_validations";

	/** The parameter name for &quot;Use the given random seed instead of global random numbers (-1: use global)&quot; */
	public static final String PARAMETER_LOCAL_RANDOM_SEED = "local_random_seed";

	/** The parameter name for &quot;Also evaluate performance for the classes used for attribute selection.&quot; */
	public static final String PARAMETER_AUTO_VALIDATION = "auto_validation";

	/** The parameter name for &quot;Also evaluate performance with all attributes switched on.&quot; */
	public static final String PARAMETER_ALL_ATTRIBUTES = "all_attributes";

	private static final Class<?>[] OUTPUT_CLASSES = { PerformanceVector.class, AttributeWeights.class };

	private static final Class<?>[] INPUT_CLASSES = { ExampleSet.class };

	/** An attribute counts as selected if its weight is strictly greater than this value. */
	private static final double SELECTION_THRESHOLD = .99;

	/** Appended after every attribute name in the logged list of selected attributes. */
	private static final String ATTRIBUTE_SEPARATOR = "#";

	/** Main criterion of the most recent evaluation on the unseen classes (or of the final average). */
	private PerformanceCriterion lastPerformance;

	/** Main criterion of the most recent evaluation on the classes used for attribute selection. */
	private PerformanceCriterion lastAutoPerformance;

	/** Main criterion of the most recent evaluation with all attributes switched on. */
	private PerformanceCriterion lastAllAttributesPerformance;

	/** Output of the first inner operator; set by {@link #useMethod(ExampleSet)}, cleared by {@link #evaluate(ExampleSet)}. */
	private IOContainer methodResult;

	/** Total number of iterations. */
	private int number;

	/** Current iteration. */
	private int iteration;

	// Sizes of the example sets used in the different evaluation runs; exposed as log values.
	private int sizeUnseen = 0;
	private int sizeAuto = 0;
	private int sizeAll = 0;
	private int sizeFeatures = 0;

	// Flags telling the logging facilities which evaluation run is currently in progress.
	private boolean inAuto = false;
	private boolean inAllAttributes = false;

	/** Quoted string representation of the class subset used in the current iteration. */
	private String subset = "";

	/** Quoted, '#'-terminated list of the attribute names selected in the current iteration. */
	private String features = "";


	/**
	 * Creates the operator and registers all values that can be logged while it runs.
	 *
	 * @param description the operator description passed on to the superclass
	 */
	public UnseenClassValidation(OperatorDescription description) {
		super(description);
		addValue(new ValueDouble("performance", "The last performance (main criterion) when evaluating on the unseen classes.") {
			public double getDoubleValue() {
				return (lastPerformance != null) ? lastPerformance.getAverage() : Double.NaN;
			}
		});
		addValue(new ValueDouble("autoPerformance", "The last performance (main criterion) when evaluating on the classes used to learn the attribute weights.") {
			public double getDoubleValue() {
				return (lastAutoPerformance != null) ? lastAutoPerformance.getAverage() : Double.NaN;
			}
		});
		addValue(new ValueDouble("allAttributesPerformance", "The last performance (main criterion) when evaluating with all attributes switched on.") {
			public double getDoubleValue() {
				return (lastAllAttributesPerformance != null) ? lastAllAttributesPerformance.getAverage() : Double.NaN;
			}
		});
		addValue(new ValueDouble("variance", "The variance of the last performance (main criterion).") {
			public double getDoubleValue() {
				return (lastPerformance != null) ? lastPerformance.getVariance() : Double.NaN;
			}
		});
		addValue(new ValueDouble("iteration", "The number of the current iteration.") {
			public double getDoubleValue() {
				return iteration;
			}
		});
		// The int counters widen to double implicitly; no boxing is needed.
		addValue(new ValueDouble("sizeFeatures", "The number of selected features.") {
			public double getDoubleValue() {
				return sizeFeatures;
			}
		});
		addValue(new ValueDouble("sizeAuto", "The size of the set used for evaluation on the classes used to learn the attribute weights.") {
			public double getDoubleValue() {
				return sizeAuto;
			}
		});
		addValue(new ValueDouble("sizeAllAttributes", "The size of the set used for evaluation with all atributes switched on.") {
			public double getDoubleValue() {
				return sizeAll;
			}
		});
		addValue(new ValueDouble("sizeUnseen", "The size of the set used for evaluation on the unseen classes.") {
			public double getDoubleValue() {
				return sizeUnseen;
			}
		});
		addValue(new ValueString("inAuto", "True if currently performing evaluation on the classes used to learn the attribute weights.") {
			public String getStringValue() {
				return inAuto ? "true" : "false";
			}
		});
		addValue(new ValueString("inAllAttributes", "True if currently performing evaluation with all attributes switched on.") {
			public String getStringValue() {
				return inAllAttributes ? "true" : "false";
			}
		});
		addValue(new ValueString("subset", "Subset of classes used to learn the attribute weights.") {
			public String getStringValue() {
				return subset;
			}
		});
		addValue(new ValueString("features", "Names of the selected attributes.") {
			public String getStringValue() {
				return features;
			}
		});
	}

	/** Returns the maximum number of inner operators. */
	public int getMaxNumberOfInnerOperators() {
		return 2;
	}

	/** Returns the minimum number of inner operators. */
	public int getMinNumberOfInnerOperators() {
		return 2;
	}

	/** Returns the classes delivered by this operator: a performance vector and attribute weights. */
	public Class<?>[] getOutputClasses() {
		return OUTPUT_CLASSES;
	}

	/** Returns the classes consumed by this operator: a single example set. */
	public Class<?>[] getInputClasses() {
		return INPUT_CLASSES;
	}

	/**
	 * Declares the contract of the two inner operators: the first ("FeatureSelection")
	 * maps an example set to attribute weights, the second ("Evaluation") maps an
	 * example set to a performance vector.
	 */
	public InnerOperatorCondition getInnerOperatorCondition() {
		CombinedInnerOperatorCondition condition = new CombinedInnerOperatorCondition();
		condition.addCondition(new SpecificInnerOperatorCondition("FeatureSelection", 0,
				new Class[] { ExampleSet.class }, new Class[] { AttributeWeights.class }));
		condition.addCondition(new SpecificInnerOperatorCondition("Evaluation", 1,
				new Class[] { ExampleSet.class }, new Class[] { PerformanceVector.class }));
		return condition;
	}


	/**
	 * Runs the validation. In each of the configured iterations a random half of the
	 * classes is selected, the first inner operator computes attribute weights on
	 * examples of these classes, and the second inner operator is evaluated on
	 * examples of the remaining classes using these weights. Depending on the
	 * parameters, the second inner operator is additionally evaluated on held-out
	 * examples of the selected classes and on the unseen classes with all attributes.
	 *
	 * @return the averaged performance on the unseen classes, followed by the averaged
	 *         auto-validation performance (if enabled), the averaged all-attributes
	 *         performance (if enabled) and the attribute weights averaged over all
	 *         iterations
	 * @throws OperatorException if the input has no label or fewer than two classes,
	 *         or if one of the inner operators fails
	 */
	public IOObject[] apply() throws OperatorException {
		ExampleSet eSet = getInput(ExampleSet.class);
		int eSetSize = eSet.size();

		number = getParameterAsInt(PARAMETER_NUMBER_OF_VALIDATIONS);
		int randomSeed = getParameterAsInt(PARAMETER_LOCAL_RANDOM_SEED);
		// The flags are read once so that all iterations work with the same settings.
		boolean autoValidation = getParameterAsBoolean(PARAMETER_AUTO_VALIDATION);
		boolean allAttributes = getParameterAsBoolean(PARAMETER_ALL_ATTRIBUTES);

		RandomGenerator random;
		if (randomSeed != -1) {
			random = RandomGenerator.getRandomGenerator(true, randomSeed);
		} else {
			random = RandomGenerator.getGlobalRandomGenerator();
		}

		Attribute label = eSet.getAttributes().getLabel();
		if (label == null) {
			throw new OperatorException("UnseenClassValidation: the input example set has no label attribute.");
		}
		SplittedExampleSet inputSet = SplittedExampleSet.splitByAttribute(eSet, label);
		int nrOfClasses = inputSet.getNumberOfSubsets();
		if (nrOfClasses < 2) {
			// With fewer than two classes the selected half is empty, which would
			// lead to an empty training set and a NaN split ratio further below.
			throw new OperatorException("UnseenClassValidation: at least two classes are required, but the label has only "
					+ nrOfClasses + ".");
		}

		List<AverageVector> averageVectors = new LinkedList<AverageVector>();
		List<AverageVector> autoVectors = new LinkedList<AverageVector>();
		List<AverageVector> allAttributesVectors = new LinkedList<AverageVector>();

		// Accumulates the weights of all iterations; divided by the iteration count at the end.
		AttributeWeights globalWeights = new AttributeWeights();
		for (Attribute attribute : eSet.getAttributes()) {
			globalWeights.setWeight(attribute.getName(), 0.0d);
		}

		for (iteration = 0; iteration < number; iteration++) {

			// select classes for feature selection
			inputSet.clearSelection();
			int[] randomSubset = getRandomSubset(nrOfClasses, random);
			for (int classIndex : randomSubset) {
				inputSet.selectAdditionalSubset(classIndex);
			}
			subset = "\"" + Arrays.toString(randomSubset) + "\"";
			int subsetSize = inputSet.size();
			int unseenSize = eSetSize - subsetSize;

			// prepare auto evaluation: hold out 'size' examples of the known classes
			int size = 0;
			SplittedExampleSet firstExampleSet;
			if (autoValidation) {
				size = (int) Math.floor(Math.min(unseenSize, subsetSize / 2.0));
				double ratio = 1.0 - size / (double) subsetSize;
				firstExampleSet = new SplittedExampleSet(inputSet, ratio, SplittedExampleSet.SHUFFLED_SAMPLING, true, randomSeed);
				firstExampleSet.selectSingleSubset(0);
			} else {
				firstExampleSet = (SplittedExampleSet) inputSet.clone();
			}

			// apply method
			AttributeWeights weights = useMethod(firstExampleSet).remove(AttributeWeights.class);
			methodResult.remove(ExampleSet.class);
			sizeFeatures = countSelectedAttributes(weights);
			features = "\"" + getSelectedAttributes(weights) + "\"";

			// evaluation for classes used to find the selection
			if (autoValidation) {
				inAuto = true;
				try {
					firstExampleSet.selectSingleSubset(1);
					sizeAuto = firstExampleSet.size();
					sizeUnseen = 0;
					sizeAll = 0;

					AttributeWeightedExampleSet autoWeightedSet = new AttributeWeightedExampleSet(firstExampleSet, weights, 0.0d).createCleanClone();
					IOContainer autoEvalOutput = getEvaluator().apply(methodResult.append(new IOObject[] { autoWeightedSet }));

					// retrieve performance and build performance averages
					PerformanceVector autoIterationPerformance = autoEvalOutput.get(PerformanceVector.class);
					Tools.handleAverages(autoEvalOutput, autoVectors, true);

					setAutoResult(autoIterationPerformance.getMainCriterion());
				} finally {
					// never leave the flag set if the evaluator fails
					inAuto = false;
				}
			}

			// evaluation for new classes
			inputSet.invertSelection();
			SplittedExampleSet newInputSet;

			// if we are doing auto evaluation, we want to compare the performance on sets of equal size
			if (autoValidation) {
				double ratio = size / (double) inputSet.size();
				newInputSet = new SplittedExampleSet(inputSet, ratio, SplittedExampleSet.SHUFFLED_SAMPLING, true, randomSeed);
				newInputSet.selectSingleSubset(0);
			} else {
				newInputSet = (SplittedExampleSet) inputSet.clone();
			}

			if (allAttributes) {
				inAllAttributes = true;
				try {
					sizeAll = newInputSet.size();
					sizeUnseen = 0;

					IOContainer attribOutput = getEvaluator().apply(methodResult.append(new IOObject[] { (ExampleSet) newInputSet.clone() }));

					// retrieve performance and build performance averages
					PerformanceVector allAttributesIterationPerformance = attribOutput.get(PerformanceVector.class);
					Tools.handleAverages(attribOutput, allAttributesVectors, true);

					setAllAttributesResult(allAttributesIterationPerformance.getMainCriterion());
				} finally {
					// never leave the flag set if the evaluator fails
					inAllAttributes = false;
				}
			}

			sizeUnseen = newInputSet.size();
			AttributeWeightedExampleSet weightedSet = new AttributeWeightedExampleSet(newInputSet, weights, 0.0d).createCleanClone();
			IOContainer evalOutput = evaluate(weightedSet);

			// retrieve performance and build performance averages
			PerformanceVector iterationPerformance = evalOutput.get(PerformanceVector.class);
			Tools.handleAverages(evalOutput, averageVectors, true);

			// build weights average
			handleWeights(globalWeights, weights);

			setResult(iterationPerformance.getMainCriterion());
			inApplyLoop();
		}
		// end of validation loop

		// build average of weights
		for (String attributeName : globalWeights.getAttributeNames()) {
			globalWeights.setWeight(attributeName, globalWeights.getWeight(attributeName) / number);
		}

		PerformanceVector averagePerformance = Tools.getPerformanceVector(averageVectors);
		setResult(averagePerformance.getMainCriterion());

		// NOTE(review): the two println calls below write to stdout; they look like
		// leftover debug output and should probably go through the operator's logging.
		PerformanceVector autoPerformanceVector = Tools.getPerformanceVector(autoVectors);
		if (autoPerformanceVector != null) {
			setAutoResult(autoPerformanceVector.getMainCriterion());
			System.out.println(autoPerformanceVector);
		}
		PerformanceVector allAttributesPerformanceVector = Tools.getPerformanceVector(allAttributesVectors);
		if (allAttributesPerformanceVector != null) {
			setAllAttributesResult(allAttributesPerformanceVector.getMainCriterion());
			System.out.println(allAttributesPerformanceVector);
		}

		List<IOObject> result = new ArrayList<IOObject>();
		result.add(averagePerformance);
		if (autoValidation) {
			result.add(autoPerformanceVector);
		}
		if (allAttributes) {
			result.add(allAttributesPerformanceVector);
		}
		result.add(globalWeights);
		return result.toArray(new IOObject[result.size()]);
	}


	/**
	 * Counts the attributes whose weight exceeds the selection threshold.
	 * The attributes are counted directly instead of splitting the string built by
	 * {@link #getSelectedAttributes(AttributeWeights)}: splitting an empty string
	 * yields one element, so an empty selection would be reported as one attribute.
	 *
	 * @param weights the attribute weights delivered by the feature selection
	 * @return the number of selected attributes, 0 if none is selected
	 */
	private int countSelectedAttributes(AttributeWeights weights) {
		int selectedCount = 0;
		for (String attributeName : weights.getAttributeNames()) {
			if (weights.getWeight(attributeName) > SELECTION_THRESHOLD) {
				selectedCount++;
			}
		}
		return selectedCount;
	}

	/**
	 * Builds the list of selected attribute names for logging.
	 *
	 * @param weights the attribute weights delivered by the feature selection
	 * @return the names of all attributes whose weight exceeds the selection
	 *         threshold, each followed by '#'; the empty string if none is selected
	 */
	private String getSelectedAttributes(AttributeWeights weights) {
		StringBuilder selected = new StringBuilder();
		for (String attributeName : weights.getAttributeNames()) {
			if (weights.getWeight(attributeName) > SELECTION_THRESHOLD) {
				selected.append(attributeName).append(ATTRIBUTE_SEPARATOR);
			}
		}
		return selected.toString();
	}


	/** Returns the first inner operator (the feature selection method). */
	private Operator getMethod() {
		return getOperator(0);
	}

	/** Returns the second inner operator (the evaluator). */
	private Operator getEvaluator() {
		return getOperator(1);
	}


	/** Stores the criterion reported by the "performance" and "variance" log values. */
	void setResult(PerformanceCriterion pc) {
		lastPerformance = pc;
	}

	/** Stores the criterion reported by the "autoPerformance" log value. */
	void setAutoResult(PerformanceCriterion pc) {
		lastAutoPerformance = pc;
	}

	/** Stores the criterion reported by the "allAttributesPerformance" log value. */
	void setAllAttributesResult(PerformanceCriterion pc) {
		lastAllAttributesPerformance = pc;
	}

	/**
	 * Applies the method (first inner operator) to the given training set and
	 * remembers its output for the following evaluation runs.
	 *
	 * @param methodTrainingSet the examples handed to the feature selection
	 * @return the output container of the first inner operator
	 */
	IOContainer useMethod(ExampleSet methodTrainingSet) throws OperatorException {
		methodResult = getMethod().apply(new IOContainer(new IOObject[] { methodTrainingSet }));
		return methodResult;
	}


	/**
	 * Applies the evaluator (second inner operator) to the remembered method output
	 * extended by the given test set. The remembered method output is discarded
	 * afterwards, so {@link #useMethod(ExampleSet)} must be called again before the
	 * next invocation.
	 *
	 * @param testSet the examples to evaluate on
	 * @return the output container of the second inner operator
	 * @throws IllegalStateException if {@link #useMethod(ExampleSet)} was not called before
	 */
	IOContainer evaluate(ExampleSet testSet) throws OperatorException {
		if (methodResult == null) {
			throw new IllegalStateException("Wrong use of UnseenClassValidation.evaluate(ExampleSet): No preceding invocation of useMethod(ExampleSet)!");
		}
		IOContainer result = getEvaluator().apply(methodResult.append(new IOObject[] { testSet }));
		methodResult = null;
		return result;
	}


	/**
	 * Adds the weights of the current iteration to the accumulated weights. A NaN
	 * accumulated weight is replaced by the current weight instead of being added to.
	 *
	 * @param globalWeights the accumulated weights, modified in place
	 * @param currentWeights the weights of the current iteration
	 */
	private void handleWeights(AttributeWeights globalWeights, AttributeWeights currentWeights) {
		for (String attributeName : currentWeights.getAttributeNames()) {
			double globalWeight = globalWeights.getWeight(attributeName);
			double currentWeight = currentWeights.getWeight(attributeName);
			if (Double.isNaN(globalWeight)) {
				globalWeights.setWeight(attributeName, currentWeight);
			} else {
				globalWeights.setWeight(attributeName, globalWeight + currentWeight);
			}
		}
	}


	/**
	 * Draws <code>n/2</code> (integer division) distinct indices from the range
	 * <code>0 .. n-1</code> without replacement.
	 * The shuffle deliberately runs over all <code>n</code> positions, i.e. exactly
	 * <code>n</code> random numbers are drawn per call, so that runs with a fixed
	 * seed keep producing the same sequence of subsets.
	 *
	 * @param n the number of indices to choose from
	 * @param random the random generator to draw from
	 * @return an array of length <code>n/2</code> holding the drawn indices
	 */
	public static int[] getRandomSubset( int n, RandomGenerator random ){
		int half = n / 2;
		int[] candidates = new int[n];
		int[] chosen = new int[half];
		for (int i = 0; i < n; i++) {
			candidates[i] = i;
		}
		for (int i = 0; i < n; i++) {
			// positions >= last + 1 already hold drawn elements and are excluded
			int last = n - i - 1;
			int pos = random.nextInt(last + 1);
			if (i < half) {
				chosen[i] = candidates[pos];
			}
			// move the drawn element out of the range of the following draws
			int swapped = candidates[last];
			candidates[last] = candidates[pos];
			candidates[pos] = swapped;
		}
		return chosen;
	}


	/**
	 * Returns the parameters of the superclass plus the number of validation
	 * iterations, the local random seed and the two switches for the additional
	 * evaluation runs.
	 */
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		ParameterType type = new ParameterTypeInt(PARAMETER_NUMBER_OF_VALIDATIONS, "Number of iterations for the validation", 2, Integer.MAX_VALUE, 10);
		type.setExpert(false);
		types.add(type);
		types.add(new ParameterTypeInt(PARAMETER_LOCAL_RANDOM_SEED, "Use the given random seed instead of global random numbers (-1: use global)", -1, Integer.MAX_VALUE, -1));
		types.add(new ParameterTypeBoolean(PARAMETER_AUTO_VALIDATION, "Also evaluate performance for the classes used for attribute selection.", false));
		types.add(new ParameterTypeBoolean(PARAMETER_ALL_ATTRIBUTES, "Also evaluate performance with all attributes switched on.", false));
		return types;
	}

}
