IGNITE-8663: Add Normalization Preprocessing support
author: zaleslaw <zaleslaw.sin@gmail.com>
Wed, 6 Jun 2018 12:28:53 +0000 (15:28 +0300)
committer: Yury Babak <ybabak@gridgain.com>
Wed, 6 Jun 2018 12:28:53 +0000 (15:28 +0300)
this closes #4117
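
This commit splits the old "normalization" preprocessor in two: per-column min-max scaling (the previous behavior) moves to the new minmaxscaling package as MinMaxScalerTrainer/MinMaxScalerPreprocessor, while the normalization package now rescales each sample vector to unit L^p norm. A minimal sketch of the resulting API, assuming the Person cache and feature extractor used in the examples below:

    // Min-max scaling: per-column rescaling into [0, 1] (the old "normalization").
    IgniteBiFunction<Integer, Person, double[]> minMaxScaler = new MinMaxScalerTrainer<Integer, Person>()
        .fit(ignite, persons, featureExtractor);

    // Normalization: per-row rescaling to unit L^p norm (here p = 1).
    IgniteBiFunction<Integer, Person, double[]> normalizer = new NormalizationTrainer<Integer, Person>()
        .withP(1)
        .fit(ignite, persons, featureExtractor);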

28 files changed:
examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/BinarizationExample.java
examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/ImputingExample.java
examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/ImputingExampleWithMostFrequentValues.java
examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/MinMaxScalerExample.java [new file with mode: 0644]
examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/NormalizationExample.java
examples/src/main/java/org/apache/ignite/examples/ml/regression/linear/LinearRegressionLSQRTrainerWithMinMaxScalerExample.java [moved from examples/src/main/java/org/apache/ignite/examples/ml/regression/linear/LinearRegressionLSQRTrainerWithNormalizationExample.java with 92% similarity]
examples/src/main/java/org/apache/ignite/examples/ml/regression/logistic/multiclass/LogRegressionMultiClassClassificationExample.java
examples/src/main/java/org/apache/ignite/examples/ml/svm/multiclass/SVMMultiClassClassificationExample.java
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/binarization/BinarizationTrainer.java
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputing/ImputerPartitionData.java [moved from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputer/ImputerPartitionData.java with 96% similarity]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputing/ImputerPreprocessor.java [moved from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputer/ImputerPreprocessor.java with 97% similarity]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputing/ImputerTrainer.java [moved from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputer/ImputerTrainer.java with 97% similarity]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputing/ImputingStrategy.java [moved from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputer/ImputingStrategy.java with 90% similarity]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputing/package-info.java [moved from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/imputer/package-info.java with 94% similarity]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPartitionData.java [moved from modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/normalization/NormalizationPartitionData.java with 78% similarity]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPreprocessor.java [new file with mode: 0644]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerTrainer.java [new file with mode: 0644]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/package-info.java [new file with mode: 0644]
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/normalization/NormalizationPreprocessor.java
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/normalization/NormalizationTrainer.java
modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/normalization/package-info.java
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/PreprocessingTestSuite.java
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/imputing/ImputerPreprocessorTest.java
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/imputing/ImputerTrainerTest.java
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPreprocessorTest.java [new file with mode: 0644]
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerTrainerTest.java [new file with mode: 0644]
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/normalization/NormalizationPreprocessorTest.java
modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/normalization/NormalizationTrainerTest.java

index edf4fd7..a8f2fa0 100644 (file)
@@ -31,10 +31,6 @@ import org.apache.ignite.ml.preprocessing.binarization.BinarizationTrainer;
 
 /**
  * Example that shows how to use binarization preprocessor to binarize data.
- *
- * Machine learning preprocessors are built as a chain. Most often a first preprocessor is a feature extractor as shown
- * in this example. The second preprocessor here is a normalization preprocessor which is built on top of the feature
- * extractor and represents a chain of itself and the underlying feature extractor.
  */
 public class BinarizationExample {
     /** Run example. */
@@ -54,8 +50,6 @@ public class BinarizationExample {
                 .withThreshold(40)
                 .fit(ignite, persons, featureExtractor);
 
-
-
             // Creates a cache based simple dataset containing features and providing standard dataset API.
             try (SimpleDataset<?> dataset = DatasetFactory.createSimpleDataset(ignite, persons, preprocessor)) {
                 // Calculation of the mean value. This calculation will be performed in map-reduce manner.
index e0c0d86..f873736 100644 (file)
@@ -27,14 +27,10 @@ import org.apache.ignite.examples.ml.dataset.model.Person;
 import org.apache.ignite.ml.dataset.DatasetFactory;
 import org.apache.ignite.ml.dataset.primitive.SimpleDataset;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
-import org.apache.ignite.ml.preprocessing.imputer.ImputerTrainer;
+import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainer;
 
 /**
- * Example that shows how to use binarization preprocessor to binarize data.
- *
- * Machine learning preprocessors are built as a chain. Most often a first preprocessor is a feature extractor as shown
- * in this example. The second preprocessor here is a normalization preprocessor which is built on top of the feature
- * extractor and represents a chain of itself and the underlying feature extractor.
+ * Example that shows how to use the Imputing preprocessor to impute missing values in the given data.
  */
 public class ImputingExample {
     /** Run example. */
index d25f6d0..2611c46 100644 (file)
@@ -27,15 +27,11 @@ import org.apache.ignite.examples.ml.dataset.model.Person;
 import org.apache.ignite.ml.dataset.DatasetFactory;
 import org.apache.ignite.ml.dataset.primitive.SimpleDataset;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
-import org.apache.ignite.ml.preprocessing.imputer.ImputerTrainer;
-import org.apache.ignite.ml.preprocessing.imputer.ImputingStrategy;
+import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainer;
+import org.apache.ignite.ml.preprocessing.imputing.ImputingStrategy;
 
 /**
- * Example that shows how to use binarization preprocessor to binarize data.
- *
- * Machine learning preprocessors are built as a chain. Most often a first preprocessor is a feature extractor as shown
- * in this example. The second preprocessor here is a normalization preprocessor which is built on top of the feature
- * extractor and represents a chain of itself and the underlying feature extractor.
+ * Example that shows how to use the Imputing preprocessor to impute missing values in the given data.
  */
 public class ImputingExampleWithMostFrequentValues {
     /** Run example. */
diff --git a/examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/MinMaxScalerExample.java b/examples/src/main/java/org/apache/ignite/examples/ml/preprocessing/MinMaxScalerExample.java
new file mode 100644 (file)
index 0000000..e60b72b
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.examples.ml.preprocessing;
+
+import org.apache.ignite.Ignite;
+import org.apache.ignite.IgniteCache;
+import org.apache.ignite.Ignition;
+import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction;
+import org.apache.ignite.configuration.CacheConfiguration;
+import org.apache.ignite.examples.ml.dataset.model.Person;
+import org.apache.ignite.ml.dataset.DatasetFactory;
+import org.apache.ignite.ml.dataset.primitive.SimpleDataset;
+import org.apache.ignite.ml.math.functions.IgniteBiFunction;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainer;
+
+import java.util.Arrays;
+
+/**
+ * Example that shows how to use the MinMaxScaler preprocessor to scale the given data.
+ *
+ * Machine learning preprocessors are built as a chain. Most often the first preprocessor is a feature extractor, as shown
+ * in this example. The second preprocessor here is a MinMaxScaler preprocessor built on top of the feature
+ * extractor; it represents a chain of itself and the underlying feature extractor.
+ */
+public class MinMaxScalerExample {
+    /** Run example. */
+    public static void main(String[] args) throws Exception {
+        try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
+            System.out.println(">>> Normalization example started.");
+
+            IgniteCache<Integer, Person> persons = createCache(ignite);
+
+            // Defines first preprocessor that extracts features from an upstream data.
+            IgniteBiFunction<Integer, Person, double[]> featureExtractor = (k, v) -> new double[] {
+                v.getAge(),
+                v.getSalary()
+            };
+
+            // Defines second preprocessor that scales features to [0, 1].
+            IgniteBiFunction<Integer, Person, double[]> preprocessor = new MinMaxScalerTrainer<Integer, Person>()
+                .fit(ignite, persons, featureExtractor);
+
+            // Creates a cache based simple dataset containing features and providing standard dataset API.
+            try (SimpleDataset<?> dataset = DatasetFactory.createSimpleDataset(ignite, persons, preprocessor)) {
+                // Calculation of the mean value. This calculation will be performed in map-reduce manner.
+                double[] mean = dataset.mean();
+                System.out.println("Mean \n\t" + Arrays.toString(mean));
+
+                // Calculation of the standard deviation. This calculation will be performed in map-reduce manner.
+                double[] std = dataset.std();
+                System.out.println("Standard deviation \n\t" + Arrays.toString(std));
+
+                // Calculation of the covariance matrix.  This calculation will be performed in map-reduce manner.
+                double[][] cov = dataset.cov();
+                System.out.println("Covariance matrix ");
+                for (double[] row : cov)
+                    System.out.println("\t" + Arrays.toString(row));
+
+                // Calculation of the correlation matrix.  This calculation will be performed in map-reduce manner.
+                double[][] corr = dataset.corr();
+                System.out.println("Correlation matrix ");
+                for (double[] row : corr)
+                    System.out.println("\t" + Arrays.toString(row));
+            }
+
+            System.out.println(">>> Normalization example completed.");
+        }
+    }
+
+    /** */
+    private static IgniteCache<Integer, Person> createCache(Ignite ignite) {
+        CacheConfiguration<Integer, Person> cacheConfiguration = new CacheConfiguration<>();
+
+        cacheConfiguration.setName("PERSONS");
+        cacheConfiguration.setAffinity(new RendezvousAffinityFunction(false, 2));
+
+        IgniteCache<Integer, Person> persons = ignite.createCache(cacheConfiguration);
+
+        persons.put(1, new Person("Mike", 42, 10000));
+        persons.put(2, new Person("John", 32, 64000));
+        persons.put(3, new Person("George", 53, 120000));
+        persons.put(4, new Person("Karl", 24, 70000));
+
+        return persons;
+    }
+}
index b2c4e12..16169ab 100644 (file)
@@ -17,6 +17,7 @@
 
 package org.apache.ignite.examples.ml.preprocessing;
 
+import java.util.Arrays;
 import org.apache.ignite.Ignite;
 import org.apache.ignite.IgniteCache;
 import org.apache.ignite.Ignition;
@@ -26,16 +27,11 @@ import org.apache.ignite.examples.ml.dataset.model.Person;
 import org.apache.ignite.ml.dataset.DatasetFactory;
 import org.apache.ignite.ml.dataset.primitive.SimpleDataset;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
+import org.apache.ignite.ml.preprocessing.binarization.BinarizationTrainer;
 import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainer;
 
-import java.util.Arrays;
-
 /**
- * Example that shows how to use normalization preprocessor to normalize data.
- *
- * Machine learning preprocessors are built as a chain. Most often a first preprocessor is a feature extractor as shown
- * in this example. The second preprocessor here is a normalization preprocessor which is built on top of the feature
- * extractor and represents a chain of itself and the underlying feature extractor.
+ * Example that shows how to use the normalization preprocessor to normalize each vector in the given data.
  */
 public class NormalizationExample {
     /** Run example. */
@@ -53,6 +49,7 @@ public class NormalizationExample {
 
             // Defines second preprocessor that normalizes features.
             IgniteBiFunction<Integer, Person, double[]> preprocessor = new NormalizationTrainer<Integer, Person>()
+                .withP(1)
                 .fit(ignite, persons, featureExtractor);
 
             // Creates a cache based simple dataset containing features and providing standard dataset API.
@@ -91,10 +88,10 @@ public class NormalizationExample {
 
         IgniteCache<Integer, Person> persons = ignite.createCache(cacheConfiguration);
 
-        persons.put(1, new Person("Mike", 42, 10000));
-        persons.put(2, new Person("John", 32, 64000));
-        persons.put(3, new Person("George", 53, 120000));
-        persons.put(4, new Person("Karl", 24, 70000));
+        persons.put(1, new Person("Mike", 10, 20));
+        persons.put(2, new Person("John", 20, 10));
+        persons.put(3, new Person("George", 30, 0));
+        persons.put(4, new Person("Karl", 25, 15));
 
         return persons;
     }
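
With the smaller feature values above and withP(1), each feature vector is divided by its L1 norm; for the first person, {10, 20} has L1 norm 30 and becomes roughly {0.333, 0.667}. A quick standalone check of that arithmetic (a sketch with an identity base preprocessor, not part of the example file):

    // L1 normalization of Mike's features {age = 10, salary = 20}: norm = 10 + 20 = 30.
    NormalizationPreprocessor<Integer, double[]> l1 = new NormalizationPreprocessor<>(1, (k, v) -> v);
    double[] res = l1.apply(0, new double[] {10, 20}); // -> {0.333..., 0.666...}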
@@ -26,8 +26,8 @@ import org.apache.ignite.cache.query.ScanQuery;
 import org.apache.ignite.configuration.CacheConfiguration;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
 import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector;
-import org.apache.ignite.ml.preprocessing.normalization.NormalizationPreprocessor;
-import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainer;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerPreprocessor;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainer;
 import org.apache.ignite.ml.regressions.linear.LinearRegressionLSQRTrainer;
 import org.apache.ignite.ml.regressions.linear.LinearRegressionModel;
 import org.apache.ignite.thread.IgniteThread;
@@ -40,10 +40,10 @@ import java.util.UUID;
  * Run linear regression model over cached dataset.
  *
  * @see LinearRegressionLSQRTrainer
- * @see NormalizationTrainer
- * @see NormalizationPreprocessor
+ * @see MinMaxScalerTrainer
+ * @see MinMaxScalerPreprocessor
  */
-public class LinearRegressionLSQRTrainerWithNormalizationExample {
+public class LinearRegressionLSQRTrainerWithMinMaxScalerExample {
     /** */
     private static final double[][] data = {
         {8, 78, 284, 9.100000381, 109},
@@ -110,13 +110,13 @@ public class LinearRegressionLSQRTrainerWithNormalizationExample {
             System.out.println(">>> Ignite grid started.");
 
             IgniteThread igniteThread = new IgniteThread(ignite.configuration().getIgniteInstanceName(),
-                LinearRegressionLSQRTrainerWithNormalizationExample.class.getSimpleName(), () -> {
+                LinearRegressionLSQRTrainerWithMinMaxScalerExample.class.getSimpleName(), () -> {
                 IgniteCache<Integer, double[]> dataCache = getTestCache(ignite);
 
-                System.out.println(">>> Create new normalization trainer object.");
-                NormalizationTrainer<Integer, double[]> normalizationTrainer = new NormalizationTrainer<>();
+                System.out.println(">>> Create new minmaxscaling trainer object.");
+                MinMaxScalerTrainer<Integer, double[]> normalizationTrainer = new MinMaxScalerTrainer<>();
 
-                System.out.println(">>> Perform the training to get the normalization preprocessor.");
+                System.out.println(">>> Perform the training to get the minmaxscaling preprocessor.");
                 IgniteBiFunction<Integer, double[], double[]> preprocessor = normalizationTrainer.fit(
                     ignite,
                     dataCache,
index f089923..61a711e 100644 (file)
@@ -32,7 +32,7 @@ import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector;
 import org.apache.ignite.ml.nn.UpdatesStrategy;
 import org.apache.ignite.ml.optimization.updatecalculators.SimpleGDParameterUpdate;
 import org.apache.ignite.ml.optimization.updatecalculators.SimpleGDUpdateCalculator;
-import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainer;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainer;
 import org.apache.ignite.ml.regressions.logistic.multiclass.LogRegressionMultiClassModel;
 import org.apache.ignite.ml.regressions.logistic.multiclass.LogRegressionMultiClassTrainer;
 import org.apache.ignite.ml.svm.SVMLinearMultiClassClassificationModel;
@@ -40,7 +40,7 @@ import org.apache.ignite.thread.IgniteThread;
 
 /**
  * Run Logistic Regression multi-class classification trainer over distributed dataset to build two models:
- * one with normalization and one without normalization.
+ * one with min-max scaling and one without it.
  *
  * @see SVMLinearMultiClassClassificationModel
  */
@@ -78,7 +78,7 @@ public class LogRegressionMultiClassClassificationExample {
                 System.out.println(">>> SVM Multi-class model");
                 System.out.println(mdl.toString());
 
-                NormalizationTrainer<Integer, double[]> normalizationTrainer = new NormalizationTrainer<>();
+                MinMaxScalerTrainer<Integer, double[]> normalizationTrainer = new MinMaxScalerTrainer<>();
 
                 IgniteBiFunction<Integer, double[], double[]> preprocessor = normalizationTrainer.fit(
                     ignite,
@@ -93,7 +93,7 @@ public class LogRegressionMultiClassClassificationExample {
                     (k, v) -> v[0]
                 );
 
-                System.out.println(">>> Logistic Regression Multi-class model with normalization");
+                System.out.println(">>> Logistic Regression Multi-class model with minmaxscaling");
                 System.out.println(mdlWithNormalization.toString());
 
                 System.out.println(">>> ----------------------------------------------------------------");
@@ -128,7 +128,7 @@ public class LogRegressionMultiClassClassificationExample {
 
                         confusionMtx[idx1][idx2]++;
 
-                        // Collect data for model with normalization
+                        // Collect data for model with min-max scaling
                         if(groundTruth != predictionWithNormalization)
                             amountOfErrorsWithNormalization++;
 
index 4054201..c2be971 100644 (file)
@@ -26,7 +26,7 @@ import org.apache.ignite.cache.query.ScanQuery;
 import org.apache.ignite.configuration.CacheConfiguration;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
 import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector;
-import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainer;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainer;
 import org.apache.ignite.ml.svm.SVMLinearMultiClassClassificationModel;
 import org.apache.ignite.ml.svm.SVMLinearMultiClassClassificationTrainer;
 import org.apache.ignite.thread.IgniteThread;
@@ -37,7 +37,7 @@ import java.util.UUID;
 
 /**
  * Run SVM multi-class classification trainer over distributed dataset to build two models:
- * one with normalization and one without normalization.
+ * one with min-max scaling and one without it.
  *
  * @see SVMLinearMultiClassClassificationModel
  */
@@ -66,7 +66,7 @@ public class SVMMultiClassClassificationExample {
                 System.out.println(">>> SVM Multi-class model");
                 System.out.println(mdl.toString());
 
-                NormalizationTrainer<Integer, double[]> normalizationTrainer = new NormalizationTrainer<>();
+                MinMaxScalerTrainer<Integer, double[]> normalizationTrainer = new MinMaxScalerTrainer<>();
 
                 IgniteBiFunction<Integer, double[], double[]> preprocessor = normalizationTrainer.fit(
                     ignite,
@@ -81,7 +81,7 @@ public class SVMMultiClassClassificationExample {
                     (k, v) -> v[0]
                 );
 
-                System.out.println(">>> SVM Multi-class model with normalization");
+                System.out.println(">>> SVM Multi-class model with minmaxscaling");
                 System.out.println(mdlWithNormalization.toString());
 
                 System.out.println(">>> ----------------------------------------------------------------");
@@ -116,7 +116,7 @@ public class SVMMultiClassClassificationExample {
 
                         confusionMtx[idx1][idx2]++;
 
-                        // Collect data for model with normalization
+                        // Collect data for model with min-max scaling
                         if(groundTruth != predictionWithNormalization)
                             amountOfErrorsWithNormalization++;
 
index 40060f4..abbf644 100644 (file)
 
 package org.apache.ignite.ml.preprocessing.binarization;
 
-import org.apache.ignite.ml.dataset.Dataset;
 import org.apache.ignite.ml.dataset.DatasetBuilder;
-import org.apache.ignite.ml.dataset.UpstreamEntry;
-import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
 import org.apache.ignite.ml.preprocessing.PreprocessingTrainer;
-import org.apache.ignite.ml.preprocessing.normalization.NormalizationPartitionData;
-import org.apache.ignite.ml.preprocessing.normalization.NormalizationPreprocessor;
 
 /**
  * Trainer of the binarization preprocessor.
  * limitations under the License.
  */
 
-package org.apache.ignite.ml.preprocessing.imputer;
+package org.apache.ignite.ml.preprocessing.imputing;
 
 import java.util.Map;
 
 /**
- * Partition data used in imputer preprocessor.
+ * Partition data used in the imputing preprocessor.
  *
  * @see ImputerTrainer
  * @see ImputerPreprocessor
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.ignite.ml.preprocessing.imputer;
+package org.apache.ignite.ml.preprocessing.imputing;
 
 import java.util.Comparator;
 import java.util.HashMap;
@@ -29,8 +29,8 @@ import org.apache.ignite.ml.math.functions.IgniteBiFunction;
 import org.apache.ignite.ml.preprocessing.PreprocessingTrainer;
 
 /**
- * Trainer of the imputer preprocessor.
- * The imputer fills the missed values according the imputing strategy (default: mean value for each feature).
+ * Trainer of the imputing preprocessor.
+ * The imputer fills in missing values according to the imputing strategy (default: mean value for each feature).
  * It supports double values in features only.
  *
  * @param <K> Type of a key in {@code upstream} data.
@@ -272,7 +272,7 @@ public class ImputerTrainer<K, V> implements PreprocessingTrainer<K, V, double[]
      * Sets the imputing strategy.
      *
      * @param imputingStgy The given value.
-     * @return The updated imputer trainer.
+     * @return The updated imputer trainer.
      */
     public ImputerTrainer<K, V> withImputingStrategy(ImputingStrategy imputingStgy){
         this.imputingStgy = imputingStgy;
@@ -15,9 +15,9 @@
  * limitations under the License.
  */
 
-package org.apache.ignite.ml.preprocessing.imputer;
+package org.apache.ignite.ml.preprocessing.imputing;
 
-/** This enum contains settings for imputer preprocessor. */
+/** This enum contains settings for the imputing preprocessor. */
 public enum ImputingStrategy {
     /** The default strategy. If this strategy is chosen, then replace missing values using the mean for the numeric features along the axis. */
     MEAN,
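
Combined with ImputerTrainer above, picking a strategy is a single builder call. A sketch, assuming the most-frequent constant is named MOST_FREQUENT, as the ImputingExampleWithMostFrequentValues class name suggests:

    // MOST_FREQUENT is an assumption inferred from the example class name.
    IgniteBiFunction<Integer, Person, double[]> imputer = new ImputerTrainer<Integer, Person>()
        .withImputingStrategy(ImputingStrategy.MOST_FREQUENT)
        .fit(ignite, persons, featureExtractor);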
  * limitations under the License.
  */
 
-package org.apache.ignite.ml.preprocessing.normalization;
+package org.apache.ignite.ml.preprocessing.minmaxscaling;
 
 /**
- * Partition data used in normalization preprocessor.
+ * Partition data used in the min-max scaler preprocessor.
  *
- * @see NormalizationTrainer
- * @see NormalizationPreprocessor
+ * @see MinMaxScalerTrainer
+ * @see MinMaxScalerPreprocessor
  */
-public class NormalizationPartitionData implements AutoCloseable {
+public class MinMaxScalerPartitionData implements AutoCloseable {
     /** Minimal values. */
     private final double[] min;
 
@@ -31,12 +31,12 @@ public class NormalizationPartitionData implements AutoCloseable {
     private final double[] max;
 
     /**
-     * Constructs a new instance of normalization partition data.
+     * Constructs a new instance of min-max scaler partition data.
      *
      * @param min Minimal values.
      * @param max Maximum values.
      */
-    public NormalizationPartitionData(double[] min, double[] max) {
+    public MinMaxScalerPartitionData(double[] min, double[] max) {
         this.min = min;
         this.max = max;
     }
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPreprocessor.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPreprocessor.java
new file mode 100644 (file)
index 0000000..f75f927
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.minmaxscaling;
+
+import org.apache.ignite.ml.math.functions.IgniteBiFunction;
+
+/**
+ * Preprocessing function that performs min-max scaling. From a mathematical point of view it is the following function,
+ * applied to every element in the dataset:
+ *
+ * {@code a_i = (a_i - min_i) / (max_i - min_i) for all i},
+ *
+ * where {@code i} is the column index, {@code max_i} is the value of the maximum element in this column and
+ * {@code min_i} is the value of the minimal element in this column.
+ *
+ * @param <K> Type of a key in {@code upstream} data.
+ * @param <V> Type of a value in {@code upstream} data.
+ */
+public class MinMaxScalerPreprocessor<K, V> implements IgniteBiFunction<K, V, double[]> {
+    /** */
+    private static final long serialVersionUID = 6997800576392623469L;
+
+    /** Minimal values. */
+    private final double[] min;
+
+    /** Maximum values. */
+    private final double[] max;
+
+    /** Base preprocessor. */
+    private final IgniteBiFunction<K, V, double[]> basePreprocessor;
+
+    /**
+     * Constructs a new instance of the min-max scaler preprocessor.
+     *
+     * @param min Minimal values.
+     * @param max Maximum values.
+     * @param basePreprocessor Base preprocessor.
+     */
+    public MinMaxScalerPreprocessor(double[] min, double[] max, IgniteBiFunction<K, V, double[]> basePreprocessor) {
+        this.min = min;
+        this.max = max;
+        this.basePreprocessor = basePreprocessor;
+    }
+
+    /**
+     * Applies this preprocessor.
+     *
+     * @param k Key.
+     * @param v Value.
+     * @return Preprocessed row.
+     */
+    @Override public double[] apply(K k, V v) {
+        double[] res = basePreprocessor.apply(k, v);
+
+        assert res.length == min.length;
+        assert res.length == max.length;
+
+        for (int i = 0; i < res.length; i++)
+            res[i] = (res[i] - min[i]) / (max[i] - min[i]);
+
+        return res;
+    }
+
+    /** */
+    public double[] getMin() {
+        return min;
+    }
+
+    /** */
+    public double[] getMax() {
+        return max;
+    }
+}
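
As a worked instance of the formula in the class javadoc: with column minima {0, 4, 1} and maxima {4, 22, 300}, the row {2, 4, 1} maps to {0.5, 0, 0}, which is exactly what the new preprocessor test asserts. A minimal sketch with an identity base preprocessor:

    // (2 - 0) / (4 - 0) = 0.5, (4 - 4) / 18 = 0, (1 - 1) / 299 = 0.
    MinMaxScalerPreprocessor<Integer, double[]> scaler = new MinMaxScalerPreprocessor<>(
        new double[] {0, 4, 1},    // per-column minima
        new double[] {4, 22, 300}, // per-column maxima
        (k, v) -> v);              // identity base preprocessor
    double[] scaled = scaler.apply(0, new double[] {2, 4, 1}); // -> {0.5, 0.0, 0.0}

Note the formula assumes max_i > min_i; a constant column would divide by zero and produce NaN.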
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerTrainer.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerTrainer.java
new file mode 100644 (file)
index 0000000..c8b547f
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.minmaxscaling;
+
+import org.apache.ignite.ml.dataset.Dataset;
+import org.apache.ignite.ml.dataset.DatasetBuilder;
+import org.apache.ignite.ml.dataset.UpstreamEntry;
+import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
+import org.apache.ignite.ml.math.functions.IgniteBiFunction;
+import org.apache.ignite.ml.preprocessing.PreprocessingTrainer;
+
+/**
+ * Trainer of the min-max scaler preprocessor.
+ *
+ * @param <K> Type of a key in {@code upstream} data.
+ * @param <V> Type of a value in {@code upstream} data.
+ */
+public class MinMaxScalerTrainer<K, V> implements PreprocessingTrainer<K, V, double[], double[]> {
+    /** {@inheritDoc} */
+    @Override public MinMaxScalerPreprocessor<K, V> fit(DatasetBuilder<K, V> datasetBuilder,
+        IgniteBiFunction<K, V, double[]> basePreprocessor) {
+        try (Dataset<EmptyContext, MinMaxScalerPartitionData> dataset = datasetBuilder.build(
+            (upstream, upstreamSize) -> new EmptyContext(),
+            (upstream, upstreamSize, ctx) -> {
+                double[] min = null;
+                double[] max = null;
+
+                while (upstream.hasNext()) {
+                    UpstreamEntry<K, V> entity = upstream.next();
+                    double[] row = basePreprocessor.apply(entity.getKey(), entity.getValue());
+
+                    if (min == null) {
+                        min = new double[row.length];
+                        for (int i = 0; i < min.length; i++)
+                            min[i] = Double.MAX_VALUE;
+                    }
+                    else
+                        assert min.length == row.length : "Base preprocessor must return exactly " + min.length
+                            + " features";
+
+                    if (max == null) {
+                        max = new double[row.length];
+                        for (int i = 0; i < max.length; i++)
+                            max[i] = -Double.MAX_VALUE;
+                    }
+                    else
+                        assert max.length == row.length : "Base preprocessor must return exactly " + max.length
+                            + " features";
+
+                    for (int i = 0; i < row.length; i++) {
+                        if (row[i] < min[i])
+                            min[i] = row[i];
+                        if (row[i] > max[i])
+                            max[i] = row[i];
+                    }
+                }
+
+                return new MinMaxScalerPartitionData(min, max);
+            }
+        )) {
+            double[][] minMax = dataset.compute(
+                data -> data.getMin() != null ? new double[][]{ data.getMin(), data.getMax() } : null,
+                (a, b) -> {
+                    if (a == null)
+                        return b;
+
+                    if (b == null)
+                        return a;
+
+                    double[][] res = new double[2][];
+
+                    res[0] = new double[a[0].length];
+                    for (int i = 0; i < res[0].length; i++)
+                        res[0][i] = Math.min(a[0][i], b[0][i]);
+
+                    res[1] = new double[a[1].length];
+                    for (int i = 0; i < res[1].length; i++)
+                        res[1][i] = Math.max(a[1][i], b[1][i]);
+
+                    return res;
+                }
+            );
+
+            return new MinMaxScalerPreprocessor<>(minMax[0], minMax[1], basePreprocessor);
+        }
+        catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
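
The trainer collects per-partition minima and maxima and merges them element-wise with Math.min/Math.max in the reduce step. It can be exercised without a grid through a local dataset builder, as the new trainer test does; a condensed sketch:

    Map<Integer, double[]> data = new HashMap<>();
    data.put(1, new double[] {2, 4, 1});
    data.put(2, new double[] {1, 8, 22});

    MinMaxScalerPreprocessor<Integer, double[]> scaler = new MinMaxScalerTrainer<Integer, double[]>()
        .fit(new LocalDatasetBuilder<>(data, 2), (k, v) -> v); // 2 partitions
    // scaler.getMin() -> {1, 4, 1}, scaler.getMax() -> {2, 8, 22}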
diff --git a/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/package-info.java b/modules/ml/src/main/java/org/apache/ignite/ml/preprocessing/minmaxscaling/package-info.java
new file mode 100644 (file)
index 0000000..fdc4591
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * <!-- Package description. -->
+ * Contains the min-max scaler preprocessor.
+ */
+package org.apache.ignite.ml.preprocessing.minmaxscaling;
index 7c94b8f..89186e0 100644 (file)
 
 package org.apache.ignite.ml.preprocessing.normalization;
 
+import org.apache.ignite.ml.math.functions.Functions;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
+import org.apache.ignite.ml.math.functions.IgniteDoubleFunction;
 
 /**
- * Preprocessing function that makes normalization. From mathematical point of view it's the following function which
- * is applied to every element in dataset:
+ * Preprocessing function that performs normalization.
  *
- * {@code a_i = (a_i - min_i) / (max_i - min_i) for all i},
- *
- * where {@code i} is a number of column, {@code max_i} is the value of the maximum element in this columns,
- * {@code min_i} is the value of the minimal element in this column.
+ * Normalization is the process of scaling individual samples to have unit norm.
+ * This process can be useful if you plan to use a quadratic form such as the dot-product or any other kernel
+ * to quantify the similarity of any pair of samples.
  *
  * @param <K> Type of a key in {@code upstream} data.
  * @param <V> Type of a value in {@code upstream} data.
  */
 public class NormalizationPreprocessor<K, V> implements IgniteBiFunction<K, V, double[]> {
     /** */
-    private static final long serialVersionUID = 6997800576392623469L;
-
-    /** Minimal values. */
-    private final double[] min;
+    private static final long serialVersionUID = 6873438115778921295L;
 
-    /** Maximum values. */
-    private final double[] max;
+    /** Normalization in L^p space. Must be greater than 0. Default value is 2. */
+    private int p = 2;
 
     /** Base preprocessor. */
     private final IgniteBiFunction<K, V, double[]> basePreprocessor;
 
     /**
-     * Constructs a new instance of normalization preprocessor.
+     * Constructs a new instance of the normalization preprocessor.
      *
-     * @param min Minimal values.
-     * @param max Maximum values.
+     * @param p Degree of the L^p space.
      * @param basePreprocessor Base preprocessor.
      */
-    public NormalizationPreprocessor(double[] min, double[] max, IgniteBiFunction<K, V, double[]> basePreprocessor) {
-        this.min = min;
-        this.max = max;
+    public NormalizationPreprocessor(int p, IgniteBiFunction<K, V, double[]> basePreprocessor) {
+        this.p = p;
         this.basePreprocessor = basePreprocessor;
     }
 
@@ -67,22 +62,32 @@ public class NormalizationPreprocessor<K, V> implements IgniteBiFunction<K, V, d
     @Override public double[] apply(K k, V v) {
         double[] res = basePreprocessor.apply(k, v);
 
-        assert res.length == min.length;
-        assert res.length == max.length;
+        double pNorm = Math.pow(foldMap(res, Functions.PLUS, Functions.pow(p), 0d), 1.0 / p);
 
         for (int i = 0; i < res.length; i++)
-            res[i] = (res[i] - min[i]) / (max[i] - min[i]);
+            res[i] /= pNorm;
 
         return res;
     }
 
-    /** */
-    public double[] getMin() {
-        return min;
+    /**
+     * Folds given array into a single value.
+     * @param vec The given array.
+     * @param foldFun Folding function that takes two parameters: accumulator and the current value.
+     * @param mapFun Mapping function that is called on each vector element before it is passed to the accumulator (as its
+     * second parameter).
+     * @param zero Zero value for fold operation.
+     * @return Folded value of this vector.
+     */
+    private double foldMap(double[] vec, IgniteBiFunction<Double,Double,Double> foldFun, IgniteDoubleFunction<Double> mapFun, double zero) {
+        for (double feature : vec)
+            zero = foldFun.apply(zero, mapFun.apply(feature));
+
+        return zero;
     }
 
-    /** */
-    public double[] getMax() {
-        return max;
+    /** Gets the degree of the L^p space. */
+    public double p() {
+        return p;
     }
 }
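
Concretely, apply() divides every component of a row by the row's L^p norm. For the default p = 2 the row {3, 4} has norm sqrt(9 + 16) = 5 and becomes {0.6, 0.8}; a minimal sketch with an identity base preprocessor:

    NormalizationPreprocessor<Integer, double[]> l2 = new NormalizationPreprocessor<>(2, (k, v) -> v);
    double[] res = l2.apply(0, new double[] {3, 4}); // -> {0.6, 0.8}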
index 57acbad..5db4218 100644 (file)
 
 package org.apache.ignite.ml.preprocessing.normalization;
 
-import org.apache.ignite.ml.dataset.Dataset;
 import org.apache.ignite.ml.dataset.DatasetBuilder;
-import org.apache.ignite.ml.dataset.UpstreamEntry;
-import org.apache.ignite.ml.dataset.primitive.context.EmptyContext;
 import org.apache.ignite.ml.math.functions.IgniteBiFunction;
 import org.apache.ignite.ml.preprocessing.PreprocessingTrainer;
 
 /**
- * Trainer of the normalization preprocessor.
+ * Trainer of the Normalization preprocessor.
  *
  * @param <K> Type of a key in {@code upstream} data.
  * @param <V> Type of a value in {@code upstream} data.
  */
 public class NormalizationTrainer<K, V> implements PreprocessingTrainer<K, V, double[], double[]> {
+    /** Normalization in L^p space. Must be greater than 0. Default value is 2. */
+    private int p = 2;
+
     /** {@inheritDoc} */
     @Override public NormalizationPreprocessor<K, V> fit(DatasetBuilder<K, V> datasetBuilder,
         IgniteBiFunction<K, V, double[]> basePreprocessor) {
-        try (Dataset<EmptyContext, NormalizationPartitionData> dataset = datasetBuilder.build(
-            (upstream, upstreamSize) -> new EmptyContext(),
-            (upstream, upstreamSize, ctx) -> {
-                double[] min = null;
-                double[] max = null;
-
-                while (upstream.hasNext()) {
-                    UpstreamEntry<K, V> entity = upstream.next();
-                    double[] row = basePreprocessor.apply(entity.getKey(), entity.getValue());
-
-                    if (min == null) {
-                        min = new double[row.length];
-                        for (int i = 0; i < min.length; i++)
-                            min[i] = Double.MAX_VALUE;
-                    }
-                    else
-                        assert min.length == row.length : "Base preprocessor must return exactly " + min.length
-                            + " features";
-
-                    if (max == null) {
-                        max = new double[row.length];
-                        for (int i = 0; i < max.length; i++)
-                            max[i] = -Double.MAX_VALUE;
-                    }
-                    else
-                        assert max.length == row.length : "Base preprocessor must return exactly " + min.length
-                            + " features";
-
-                    for (int i = 0; i < row.length; i++) {
-                        if (row[i] < min[i])
-                            min[i] = row[i];
-                        if (row[i] > max[i])
-                            max[i] = row[i];
-                    }
-                }
-
-                return new NormalizationPartitionData(min, max);
-            }
-        )) {
-            double[][] minMax = dataset.compute(
-                data -> data.getMin() != null ? new double[][]{ data.getMin(), data.getMax() } : null,
-                (a, b) -> {
-                    if (a == null)
-                        return b;
-
-                    if (b == null)
-                        return a;
-
-                    double[][] res = new double[2][];
-
-                    res[0] = new double[a[0].length];
-                    for (int i = 0; i < res[0].length; i++)
-                        res[0][i] = Math.min(a[0][i], b[0][i]);
-
-                    res[1] = new double[a[1].length];
-                    for (int i = 0; i < res[1].length; i++)
-                        res[1][i] = Math.max(a[1][i], b[1][i]);
+        return new NormalizationPreprocessor<>(p, basePreprocessor);
+    }
 
-                    return res;
-                }
-            );
+    /**
+     * Gets the degree of the L^p space.
+     * @return The parameter value.
+     */
+    public double p() {
+        return p;
+    }
 
-            return new NormalizationPreprocessor<>(minMax[0], minMax[1], basePreprocessor);
-        }
-        catch (Exception e) {
-            throw new RuntimeException(e);
-        }
+    /**
+     * Sets the p parameter value. Must be greater than 0.
+     *
+     * @param p The given value.
+     * @return The Normalization trainer.
+     */
+    public NormalizationTrainer<K, V> withP(int p) {
+        assert p > 0;
+        this.p = p;
+        return this;
     }
 }
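
Unlike the min-max trainer, the normalization trainer is data-independent: fit() no longer builds a dataset and simply wraps the base preprocessor with the configured p. A condensed sketch mirroring the updated trainer test:

    Map<Integer, double[]> data = new HashMap<>();
    data.put(1, new double[] {1, 8, 1});

    NormalizationPreprocessor<Integer, double[]> p3 = new NormalizationTrainer<Integer, double[]>()
        .withP(3)
        .fit(new LocalDatasetBuilder<>(data, 1), (k, v) -> v);
    // Cube norm: (1 + 512 + 1)^(1/3) ~ 8.01, so {1, 8, 1} -> ~{0.125, 0.999, 0.125}.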
index 5c3146f..18c46f6 100644 (file)
@@ -17,6 +17,6 @@
 
 /**
  * <!-- Package description. -->
- * Contains standardization preprocessor.
+ * Contains the normalizer preprocessor.
  */
-package org.apache.ignite.ml.preprocessing.normalization;
\ No newline at end of file
+package org.apache.ignite.ml.preprocessing.normalization;
index cb29ecb..b13ed7d 100644 (file)
@@ -23,6 +23,8 @@ import org.apache.ignite.ml.preprocessing.encoding.StringEncoderPreprocessorTest
 import org.apache.ignite.ml.preprocessing.encoding.StringEncoderTrainerTest;
 import org.apache.ignite.ml.preprocessing.imputing.ImputerPreprocessorTest;
 import org.apache.ignite.ml.preprocessing.imputing.ImputerTrainerTest;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerPreprocessorTest;
+import org.apache.ignite.ml.preprocessing.minmaxscaling.MinMaxScalerTrainerTest;
 import org.apache.ignite.ml.preprocessing.normalization.NormalizationPreprocessorTest;
 import org.apache.ignite.ml.preprocessing.normalization.NormalizationTrainerTest;
 import org.junit.runner.RunWith;
@@ -33,14 +35,16 @@
  */
 @RunWith(Suite.class)
 @Suite.SuiteClasses({
-    NormalizationPreprocessorTest.class,
-    NormalizationTrainerTest.class,
+    MinMaxScalerPreprocessorTest.class,
+    MinMaxScalerTrainerTest.class,
     BinarizationPreprocessorTest.class,
     BinarizationTrainerTest.class,
     ImputerPreprocessorTest.class,
     ImputerTrainerTest.class,
     StringEncoderTrainerTest.class,
-    StringEncoderPreprocessorTest.class
+    StringEncoderPreprocessorTest.class,
+    NormalizationTrainerTest.class,
+    NormalizationPreprocessorTest.class
 })
 public class PreprocessingTestSuite {
     // No-op.
index e3fd4cf..f0f56d3 100644 (file)
@@ -17,8 +17,6 @@
 
 package org.apache.ignite.ml.preprocessing.imputing;
 
-import org.apache.ignite.ml.preprocessing.binarization.BinarizationPreprocessor;
-import org.apache.ignite.ml.preprocessing.imputer.ImputerPreprocessor;
 import org.junit.Test;
 
 import static org.junit.Assert.assertArrayEquals;
index 06e52fa..a4bb847 100644 (file)
@@ -22,9 +22,6 @@ import java.util.HashMap;
 import java.util.Map;
 import org.apache.ignite.ml.dataset.DatasetBuilder;
 import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder;
-import org.apache.ignite.ml.preprocessing.imputer.ImputerPreprocessor;
-import org.apache.ignite.ml.preprocessing.imputer.ImputerTrainer;
-import org.apache.ignite.ml.preprocessing.imputer.ImputingStrategy;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPreprocessorTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerPreprocessorTest.java
new file mode 100644 (file)
index 0000000..5ce21d4
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.minmaxscaling;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+
+/**
+ * Tests for {@link MinMaxScalerPreprocessor}.
+ */
+public class MinMaxScalerPreprocessorTest {
+    /** Tests {@code apply()} method. */
+    @Test
+    public void testApply() {
+        double[][] data = new double[][]{
+            {2., 4., 1.},
+            {1., 8., 22.},
+            {4., 10., 100.},
+            {0., 22., 300.}
+        };
+
+        MinMaxScalerPreprocessor<Integer, double[]> preprocessor = new MinMaxScalerPreprocessor<>(
+            new double[] {0, 4, 1},
+            new double[] {4, 22, 300},
+            (k, v) -> v
+        );
+
+        double[][] standardData = new double[][]{
+            {2. / 4, (4. - 4.) / 18.,  0.},
+            {1. / 4, (8. - 4.) / 18.,  (22. - 1.) / 299.},
+            {1.,     (10. - 4.) / 18., (100. - 1.) / 299.},
+            {0.,     (22. - 4.) / 18., (300. - 1.) / 299.}
+        };
+
+       for (int i = 0; i < data.length; i++)
+           assertArrayEquals(standardData[i], preprocessor.apply(i, data[i]), 1e-8);
+    }
+}
diff --git a/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerTrainerTest.java b/modules/ml/src/test/java/org/apache/ignite/ml/preprocessing/minmaxscaling/MinMaxScalerTrainerTest.java
new file mode 100644 (file)
index 0000000..e411dca
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.ml.preprocessing.minmaxscaling;
+
+import org.apache.ignite.ml.dataset.DatasetBuilder;
+import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.Assert.assertArrayEquals;
+
+/**
+ * Tests for {@link MinMaxScalerTrainer}.
+ */
+@RunWith(Parameterized.class)
+public class MinMaxScalerTrainerTest {
+    /** Parameters. */
+    @Parameterized.Parameters(name = "Data divided on {0} partitions")
+    public static Iterable<Integer[]> data() {
+        return Arrays.asList(
+            new Integer[] {1},
+            new Integer[] {2},
+            new Integer[] {3},
+            new Integer[] {5},
+            new Integer[] {7},
+            new Integer[] {100},
+            new Integer[] {1000}
+        );
+    }
+
+    /** Number of partitions. */
+    @Parameterized.Parameter
+    public int parts;
+
+    /** Tests {@code fit()} method. */
+    @Test
+    public void testFit() {
+        Map<Integer, double[]> data = new HashMap<>();
+        data.put(1, new double[] {2, 4, 1});
+        data.put(2, new double[] {1, 8, 22});
+        data.put(3, new double[] {4, 10, 100});
+        data.put(4, new double[] {0, 22, 300});
+
+        DatasetBuilder<Integer, double[]> datasetBuilder = new LocalDatasetBuilder<>(data, parts);
+
+        MinMaxScalerTrainer<Integer, double[]> minMaxScalerTrainer = new MinMaxScalerTrainer<>();
+
+        MinMaxScalerPreprocessor<Integer, double[]> preprocessor = minMaxScalerTrainer.fit(
+            datasetBuilder,
+            (k, v) -> v
+        );
+
+        assertArrayEquals(new double[] {0, 4, 1}, preprocessor.getMin(), 1e-8);
+        assertArrayEquals(new double[] {4, 22, 300}, preprocessor.getMax(), 1e-8);
+    }
+}
index c9eb765..f3bf81f 100644 (file)
 
 package org.apache.ignite.ml.preprocessing.normalization;
 
 import org.junit.Test;
 
 import static org.junit.Assert.assertArrayEquals;
 
 /**
 * Tests for {@link NormalizationPreprocessor}.
  */
 public class NormalizationPreprocessorTest {
     /** Tests {@code apply()} method. */
     @Test
     public void testApply() {
         double[][] data = new double[][]{
-            {2., 4., 1.},
-            {1., 8., 22.},
-            {4., 10., 100.},
-            {0., 22., 300.}
+            {1, 2, 1},
+            {1, 1, 1},
+            {1, 0, 0},
         };
 
         NormalizationPreprocessor<Integer, double[]> preprocessor = new NormalizationPreprocessor<>(
-            new double[] {0, 4, 1},
-            new double[] {4, 22, 300},
+            1,
             (k, v) -> v
         );
 
-        double[][] standardData = new double[][]{
-            {2. / 4, (4. - 4.) / 18.,  0.},
-            {1. / 4, (8. - 4.) / 18.,  (22. - 1.) / 299.},
-            {1.,     (10. - 4.) / 18., (100. - 1.) / 299.},
-            {0.,     (22. - 4.) / 18., (300. - 1.) / 299.}
+        double[][] postProcessedData = new double[][]{
+            {0.25, 0.5, 0.25},
+            {0.33, 0.33, 0.33},
+            {1, 0, 0}
         };
 
        for (int i = 0; i < data.length; i++)
-           assertArrayEquals(standardData[i], preprocessor.apply(i, data[i]), 1e-8);
+           assertArrayEquals(postProcessedData[i], preprocessor.apply(i, data[i]), 1e-2);
     }
 }
index e7a0d47..ef86b07 100644 (file)
 
 package org.apache.ignite.ml.preprocessing.normalization;
 
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
 import org.apache.ignite.ml.dataset.DatasetBuilder;
 import org.apache.ignite.ml.dataset.impl.local.LocalDatasetBuilder;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-
 import static org.junit.Assert.assertArrayEquals;
 
 /**
 * Tests for {@link NormalizationTrainer}.
  */
 @RunWith(Parameterized.class)
 public class NormalizationTrainerTest {
@@ -63,14 +64,14 @@ public class NormalizationTrainerTest {
 
         DatasetBuilder<Integer, double[]> datasetBuilder = new LocalDatasetBuilder<>(data, parts);
 
-        NormalizationTrainer<Integer, double[]> standardizationTrainer = new NormalizationTrainer<>();
+        NormalizationTrainer<Integer, double[]> normalizationTrainer = new NormalizationTrainer<Integer, double[]>()
+            .withP(3);
 
-        NormalizationPreprocessor<Integer, double[]> preprocessor = standardizationTrainer.fit(
+        NormalizationPreprocessor<Integer, double[]> preprocessor = normalizationTrainer.fit(
             datasetBuilder,
             (k, v) -> v
         );
 
-        assertArrayEquals(new double[] {0, 4, 1}, preprocessor.getMin(), 1e-8);
-        assertArrayEquals(new double[] {4, 22, 300}, preprocessor.getMax(), 1e-8);
+        assertArrayEquals(new double[] {0.125, 0.99, 0.125}, preprocessor.apply(5, new double[] {1, 8, 1}), 1e-2);
     }
 }