diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMapping.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMapping.cs new file mode 100644 index 0000000000..a6a2ce262a --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMapping.cs @@ -0,0 +1,85 @@ +using System; +using System.Collections.Generic; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class ValueMappingExample + { + class SampleInfertDataWithFeatures + { + public float Age = 0; + public string Education = default; + public string EducationCategory = default; + } + + /// This example demonstrates the use of the ValueMappingEstimator by mapping string-to-string values. This is useful + /// to map strings to a grouping. In this example, the education data maps to the groups Undergraduate and Postgraduate: + /// 0-5yrs -> Undergraduate + /// 6-11yrs -> Postgraduate + /// 12+yrs -> Postgraduate + /// It's possible to have multiple keys map to the same value. + public static void Run() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + IDataView trainData = mlContext.Data.ReadFromEnumerable(data); + + // Preview of the data. + // + // Age Case Education induced parity pooled.stratum row_num ... + // 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... + // 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... + // 39.0 1.0 12+yrs 2.0 6.0 4.0 3.0 ... + // 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... + // 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... + + // If the list of keys and values are known, they can be passed to the API. The ValueMappingEstimator can also get the mapping through an IDataView + // Creating a list of keys based on the Education values from the dataset. 
+ var educationKeys = new List() + { + "0-5yrs", + "6-11yrs", + "12+yrs" + }; + + // Creating a list of associated values that will map respectively to each educationKey + var educationValues = new List() + { + "Undergraduate", + "Postgraduate", + "Postgraduate" + }; + + // Constructs the ValueMappingEstimator making the ML.net pipeline + var pipeline = mlContext.Transforms.Conversion.ValueMap(educationKeys, educationValues, ("EducationCategory", "Education")); + + // Fits the ValueMappingEstimator and transforms the data converting the Education to EducationCategory. + IDataView transformedData = pipeline.Fit(trainData).Transform(trainData); + + // Getting the resulting data as an IEnumerable of SampleInfertDataWithFeatures. This will contain the newly created column EducationCategory + IEnumerable featureRows = mlContext.CreateEnumerable(transformedData, reuseRowObject: false); + + Console.WriteLine($"Example of mapping string->string"); + Console.WriteLine($"Age\tEducation\tEducationCategory"); + foreach (var featureRow in featureRows) + { + Console.WriteLine($"{featureRow.Age}\t{featureRow.Education} \t{featureRow.EducationCategory}"); + } + + // Features column obtained post-transformation. 
+ // + // Age Education EducationCategory + // 26 0-5yrs Undergraduate + // 42 0-5yrs Undergraduate + // 39 12+yrs Postgraduate + // 34 0-5yrs Undergraduate + // 35 6-11yrs Postgraduate + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingFloatToString.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingFloatToString.cs new file mode 100644 index 0000000000..6d0c69d87c --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingFloatToString.cs @@ -0,0 +1,79 @@ +using System; +using System.Collections.Generic; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class ValueMappingFloatToStringExample + { + /// + /// Helper class for retrieving the resulting data + /// + class SampleTemperatureDataWithCategory + { + public DateTime Date = default; + public float Temperature = 0.0f; + public string TemperatureCategory = default; + } + + /// This example demonstrates the use of ValueMappingEstimator by mapping float-to-string values. This is useful if the key + /// data are floating point and need to be grouped into string values. In this example, the Temperature value is mapped to + /// "T1", "T2", "T3", and "T4" groups. + public static void Run() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetSampleTemperatureData(); + IDataView trainData = mlContext.Data.ReadFromEnumerable(data); + + // If the list of keys and values are known, they can be passed to the API. 
The ValueMappingEstimator can also get the mapping through an IDataView + // Creating a list of keys based on the Temperature values from the dataset + var temperatureKeys = new List() + { + 39.0F, + 67.0F, + 75.0F, + 82.0F, + }; + + // Creating a list of values, these strings will map accordingly to each key. + var classificationValues = new List() + { + "T1", + "T2", + "T3", + "T4" + }; + + // Constructs the ValueMappingEstimator making the ML.net pipeline + var pipeline = mlContext.Transforms.Conversion.ValueMap(temperatureKeys, classificationValues, ("TemperatureCategory", "Temperature")); + + // Fits the ValueMappingEstimator and transforms the data adding the TemperatureCategory column. + IDataView transformedData = pipeline.Fit(trainData).Transform(trainData); + + // Getting the resulting data as an IEnumerable of SampleTemperatureDataWithCategory. This will contain the newly created column TemperatureCategory + IEnumerable featureRows = mlContext.CreateEnumerable(transformedData, reuseRowObject: false); + + Console.WriteLine($"Example of mapping float->string"); + Console.WriteLine($"Date\t\tTemperature\tTemperatureCategory"); + foreach (var featureRow in featureRows) + { + Console.WriteLine($"{featureRow.Date.ToString("d")}\t{featureRow.Temperature}\t\t{featureRow.TemperatureCategory}"); + } + + // Features column obtained post-transformation. 
+ // + // Example of mapping float->string + // Date Temperature TemperatureCategory + // 1/1/2012 39 T1 + // 1/2/2012 82 T4 + // 1/3/2012 75 T3 + // 1/4/2012 67 T2 + // 1/5/2012 75 T3 + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToArray.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToArray.cs new file mode 100644 index 0000000000..e15049b460 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToArray.cs @@ -0,0 +1,81 @@ +using System; +using System.Collections.Generic; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Conversions; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class ValueMappingStringToArrayExample + { + /// + /// Helper class for retrieving the resulting data + /// + class SampleInfertDataWithIntArray + { + public float Age = 0; + public string Education = default; + public int[] EducationFeature = default; + } + + /// This example demonstrates the use of the ValueMappingEstimator by mapping string-to-array values which allows for mapping string data + /// to numeric arrays that can then be used as a feature set for a trainer. In this example, we are mapping the education data to + /// arbitrary integer arrays with the following association: + /// 0-5yrs -> 1, 2, 3 + /// 6-11yrs -> 5, 6, 7 + /// 12+yrs -> 42,32,64 + public static void Run() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + IDataView trainData = mlContext.Data.ReadFromEnumerable(data); + + // If the list of keys and values are known, they can be passed to the API. 
The ValueMappingEstimator can also get the mapping through an IDataView + // Creating a list of keys based on the Education values from the dataset + var educationKeys = new List() + { + "0-5yrs", + "6-11yrs", + "12+yrs" + }; + + // Sample list of associated array values + var educationValues = new List() + { + new int[] { 1,2,3 }, + new int[] { 5,6,7 }, + new int[] { 42,32,64 } + }; + + // Constructs the ValueMappingEstimator making the ML.net pipeline + var pipeline = new ValueMappingEstimator(mlContext, educationKeys, educationValues, ("EducationFeature", "Education")); + + // Fits the ValueMappingEstimator and transforms the data adding the EducationFeature column. + IDataView transformedData = pipeline.Fit(trainData).Transform(trainData); + + // Getting the resulting data as an IEnumerable of SampleInfertDataWithIntArray. This will contain the newly created column EducationFeature + IEnumerable featuresColumn = mlContext.CreateEnumerable(transformedData, reuseRowObject: false); + + Console.WriteLine($"Example of mapping string->array"); + Console.WriteLine($"Age\tEducation\tEducationFeature"); + foreach (var featureRow in featuresColumn) + { + Console.WriteLine($"{featureRow.Age}\t{featureRow.Education} \t{string.Join(",", featureRow.EducationFeature)}"); + } + + // Features column obtained post-transformation. 
+ // + // Example of mapping string->array + // Age Education EducationFeature + // 26 0-5yrs 1,2,3 + // 42 0-5yrs 1,2,3 + // 39 12+yrs 42,32,64 + // 34 0-5yrs 1,2,3 + // 35 6-11yrs 5,6,7 + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs new file mode 100644 index 0000000000..39a14d2442 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Conversions; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class ValueMappingStringToKeyTypeExample + { + /// + /// Helper class for retrieving the resulting data + /// + class SampleInfertDataWithFeatures + + { + public float Age = 0; + public string Education = default; + public string EducationCategory = default; + } + + /// This example demonstrates the use of KeyTypes using both the ValueMappingEstimator and KeyToValueEstimator. Using a KeyType + /// instead of the actual value provides a unique integer representation of the value. When the treatValueAsKeyTypes is true, + /// the ValueMappingEstimator will generate a KeyType for each unique value. + /// + /// In this example, the education data is mapped to a grouping of 'Undergraduate' and 'Postgraduate'. Because KeyTypes are used, the + /// ValueMappingEstimator will output the KeyType value rather than string value of 'Undergraduate' or 'Postgraduate'. + /// + /// The KeyToValueEstimator is added to the pipeline to convert the KeyType back to the original value. Therefore the output of this example + /// results in the string value of 'Undergraduate' and 'Postgraduate'. + public static void Run() + { + // Create a new ML context, for ML.NET operations. 
It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetInfertData(); + IDataView trainData = mlContext.Data.ReadFromEnumerable(data); + + // Creating a list of keys based on the Education values from the dataset + // These lists are created by hand for the demonstration, but the ValueMappingEstimator does take an IEnumerable. + var educationKeys = new List() + { + "0-5yrs", + "6-11yrs", + "12+yrs" + }; + + // Creating a list of values that are sample strings. These will be converted to KeyTypes + var educationValues = new List() + { + "Undergraduate", + "Postgraduate", + "Postgraduate" + }; + + // Generate the ValueMappingEstimator that will output KeyTypes even though our values are strings. + // The KeyToValueMappingEstimator is added to provide a reverse lookup of the KeyType, converting the KeyType value back + // to the original value. + var pipeline = new ValueMappingEstimator(mlContext, educationKeys, educationValues, true, ("EducationKeyType", "Education")) + .Append(new KeyToValueMappingEstimator(mlContext, ("EducationCategory", "EducationKeyType"))); + + // Fits the ValueMappingEstimator and transforms the data adding the EducationKeyType column. + IDataView transformedData = pipeline.Fit(trainData).Transform(trainData); + + // Getting the resulting data as an IEnumerable of SampleInfertDataWithFeatures. + IEnumerable featureRows = mlContext.CreateEnumerable(transformedData, reuseRowObject: false); + + Console.WriteLine($"Example of mapping string->keytype"); + Console.WriteLine($"Age\tEducation\tEducationCategory"); + foreach (var featureRow in featureRows) + { + Console.WriteLine($"{featureRow.Age}\t{featureRow.Education} \t{featureRow.EducationCategory}"); + } + + // Features column obtained post-transformation. 
+ // + // Age Education EducationCategory + // 26 0-5yrs Undergraduate + // 42 0-5yrs Undergraduate + // 39 12+yrs Postgraduate + // 34 0-5yrs Undergraduate + // 35 6-11yrs Postgraduate + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index cdcd3a83f2..f83e5fd95e 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -63,6 +63,12 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers /// /// The categorical transform's catalog. /// Name of the column to transform. + /// + /// + /// + /// public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string inputColumnName) => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumnName); @@ -72,6 +78,12 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co /// /// The categorical transform's catalog /// The pairs of input and output columns. + /// + /// + /// + /// public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, params (string outputColumnName, string inputColumnName)[] columns) => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns); @@ -124,7 +136,7 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData); /// - /// Maps specified keys to specified values + /// /// /// The key type. /// The value type. @@ -133,7 +145,16 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co /// cannot contain duplicate keys. /// The list of values to pair with the keys for the mapping. This list must be equal to the same length as keys. 
/// The columns to apply this transform on. - /// + /// An instance of the ValueMappingEstimator + /// + /// + /// + /// public static ValueMappingEstimator ValueMap( this TransformsCatalog.ConversionTransforms catalog, IEnumerable keys, @@ -142,17 +163,23 @@ public static ValueMappingEstimator ValueMap new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, columns); /// - /// Maps the using the keys in the dictionary to the values of dictionary i.e. - /// a value 'x' in the would be mappped to a value stored in dictionary[x]. - /// In this case, the is used to build up the dictionary where - /// and specify the keys and values of dictionary respectively. + /// /// /// The categorical transform's catalog /// An instance of that contains the key and value columns. /// Name of the key column in . /// Name of the value column in . /// The columns to apply this transform on. - /// + /// An instance of the ValueMappingEstimator + /// + /// + /// + /// public static ValueMappingEstimator ValueMap( this TransformsCatalog.ConversionTransforms catalog, IDataView lookupMap, string keyColumn, string valueColumn, params (string outputColumnName, string inputColumnName)[] columns) diff --git a/src/Microsoft.ML.Data/Transforms/ValueMapping.cs b/src/Microsoft.ML.Data/Transforms/ValueMapping.cs index 70b46fad01..3d0e799a39 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueMapping.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueMapping.cs @@ -33,10 +33,7 @@ namespace Microsoft.ML.Transforms.Conversions { - /// - /// The ValueMappingEstimator is a 1-1 mapping from a key to value. This particular class load the mappings from an . 
- /// This gives user the flexibility to load the mapping from file instead of using IEnumerable in - /// + /// public class ValueMappingEstimator : TrivialEstimator { private readonly (string outputColumnName, string inputColumnName)[] _columns; @@ -87,11 +84,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) } } - /// - /// The ValueMappingEstimator is a 1-1 mapping from a key to value. The key type and value type are specified - /// through TKey and TValue. TKey is always a scalar. TValue can be either a scalar or an array (array is only possible when input is scalar). - /// The mapping is specified, not trained by providing a list of keys and a list of values. - /// + /// /// Specifies the key type. /// Specifies the value type. public sealed class ValueMappingEstimator : ValueMappingEstimator @@ -302,6 +295,7 @@ internal static IDataView CreateDataView(IHostEnvironment env, } } + /// public class ValueMappingTransformer : OneToOneTransformerBase { internal const string Summary = "Maps text values columns to new columns using a map dataset."; diff --git a/src/Microsoft.ML.Data/Transforms/doc.xml b/src/Microsoft.ML.Data/Transforms/doc.xml index a743ee06a4..47671617b9 100644 --- a/src/Microsoft.ML.Data/Transforms/doc.xml +++ b/src/Microsoft.ML.Data/Transforms/doc.xml @@ -63,6 +63,32 @@ but can be defined through other means: either with the mapping defined directly on the command line, or as loaded from an external file. + + + The ValueMappingEstimator is a 1-1 mapping from a key to value. + + + Given a set of keys and values, the ValueMappingEstimator builds up a dictionary so that when given a specific key it will return a + specific value. The ValueMappingEstimator supports keys and values of different to support different data types. + Examples for using a ValueMappingEstimator are: + + + Converting a string value to a string value, this can be useful for grouping (i.e. 
'cat', 'dog', 'horse' maps to 'mammals') + + + Converting a string value to an integer value (i.e. converting a text description such as quality to a numeric value, where 'good' maps to 1 and 'poor' maps to 0) + + + + Converting an integer value to a string value and have the string value represented as a + (i.e. convert zip codes to a state string value, which will generate a unique integer value that can be used as a label). + + + + Values can be repeated to allow for multiple keys to map to the same value, however keys cannot be repeated. The mapping between keys and values + can be specified either through lists, where the key list and value list must be the same size or can be done through an . + + diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 90f8417d1f..b341486390 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -118,6 +118,23 @@ public static IEnumerable GetTopicsData() return data; } + public class SampleTemperatureData + { + public DateTime Date {get; set; } + public float Temperature { get; set; } + } + + public static IEnumerable GetSampleTemperatureData() + { + var data = new List(); + data.Add(new SampleTemperatureData { Date = new DateTime(2012,1,1), Temperature = 39.0F }); + data.Add(new SampleTemperatureData { Date = new DateTime(2012,1,2), Temperature = 82.0F }); + data.Add(new SampleTemperatureData { Date = new DateTime(2012,1,3), Temperature = 75.0F }); + data.Add(new SampleTemperatureData { Date = new DateTime(2012,1,4), Temperature = 67.0F }); + data.Add(new SampleTemperatureData { Date = new DateTime(2012,1,5), Temperature = 75.0F }); + return data; + } + /// /// Represents the column of the infertility dataset. 
/// @@ -168,7 +185,7 @@ public static IEnumerable GetInfertData() data.Add(new SampleInfertData { RowNum = 2, - Education = "0-5yrs", + Education = "12+yrs", Age = 39, Parity = 6, Induced = 2,