From 585061ec31f84051bafa662909b4a061a22595d7 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Fri, 31 Mar 2017 20:51:21 -0400
Subject: [PATCH 01/51] Put the targets into the gene evaluate(..).

---
 mep/genetics/gene.py            | 48 ++++++++++++++++++++++++++-------
 tests/mep/genetics/test_gene.py | 16 +++++++----
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index a00c8bd..14ce831 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -10,7 +10,7 @@ class Gene(object):
     __metaclass__ = ABCMeta
 
     @abstractmethod
-    def evaluate(self, gene_index, eval_matrix, data_matrix, constants):
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
         """
         This method will modify the eval_matrix for this gene index for each example in the data_matrix.
 
@@ -24,7 +24,10 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants):
         :type data_matrix: np.matrix
         :param constants: the constants associated with this chromosome
         :type constants: list
-        :return: nothing; modifies the eval_matrix
+        :param targets: the targets; equal to the number of examples (n)
+        :type targets: list
+        :return: error (sum of error across the examples); modifies the eval_matrix
+        :rtype: float
         """
 
 
@@ -49,7 +52,7 @@ def __init__(self, index, is_feature=True):
         self.index = index
         self.is_feature = is_feature
 
-    def evaluate(self, gene_index, eval_matrix, data_matrix, constants):
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
         """
         This method will modify the eval_matrix for this gene index for each example in the data_matrix.
 
@@ -64,18 +67,32 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants):
         :type data_matrix: np.matrix
         :param constants: the constants associated with this chromosome
         :type constants: list
-        :return: nothing; modifies the eval_matrix
+        :param targets: the targets; equal to the number of examples (n)
+        :type targets: list
+        :return: error (sum of error); modifies the eval_matrix
+        :rtype: float
         """
+        # TODO: Move common logic up
+        # TODO: Handle classification as well as regression
+
         # go through and set the data
         num_examples = eval_matrix.shape[1]
+        sum_of_errors = 0.
         for example_index in range(0, num_examples):
             # each column is one example in the data matrix (i.e. one feature vector)
             # if we are a feature variable then we look at the corresponding feature in the feature vector for this
             # example; otherwise (as a constant) we just go to that (independent of the example we are in)
             if self.is_feature:
-                eval_matrix[gene_index, example_index] = data_matrix[example_index, self.index]
+                value = data_matrix[example_index, self.index]
             else:
-                eval_matrix[gene_index, example_index] = constants[self.index]
+                value = constants[self.index]
+            # calculate error
+            sum_of_errors += abs(targets[example_index] - value)
+
+            # set it in the eval matrix
+            eval_matrix[gene_index, example_index] = value
+
+        return sum_of_errors
 
     def __str__(self):
         return "VariableGene({}, is_feature={})".format(self.index, self.is_feature)
@@ -112,7 +129,7 @@ def __init__(self, operation, address1, address2):
         self.address1 = address1
         self.address2 = address2
 
-    def evaluate(self, gene_index, eval_matrix, data_matrix, constants):
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
         """
         This method will modify the eval_matrix for this gene index for each example in the data_matrix.
 
@@ -126,16 +143,27 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants):
         :type data_matrix: np.matrix
         :param constants: the constants associated with this chromosome
         :type constants: list
-        :return: nothing; modifies the eval_matrix
+        :param targets: the targets; equal to the number of examples (n)
+        :type targets: list
+        :return: error (sum of error); modifies the eval_matrix
+        :rtype: float
+
         """
         # go through and set the data
         num_examples = eval_matrix.shape[1]
+        sum_of_errors = 0.
         for example_index in range(0, num_examples):
             # each column is one example in the data matrix (i.e. one feature vector)
 
             # TODO: Catch errors; in particular division can be a problem
-            eval_matrix[gene_index, example_index] = self.operation(eval_matrix[self.address1][example_index],
-                                                                    eval_matrix[self.address2][example_index])
+            value = self.operation(eval_matrix[self.address1][example_index],
+                                   eval_matrix[self.address2][example_index])
+            # set it in the eval matrix
+            eval_matrix[gene_index, example_index] = value
+
+            sum_of_errors += abs(targets[example_index] - value)
+
+        return sum_of_errors
 
     def __str__(self):
         return "OperatorGene({}, {}, {})".format(self.operation, self.address1, self.address2)
diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py
index 40aa521..7b62d12 100644
--- a/tests/mep/genetics/test_gene.py
+++ b/tests/mep/genetics/test_gene.py
@@ -25,14 +25,16 @@ def test_basic_constant(self):
         constants = [1., 2.]
         eval_matrix = np.zeros((num_genes, num_examples))
         data_matrix = np.zeros((num_examples, num_features))
+        targets = [0] * num_examples
 
         # expected; only one gene and it is going to be using the first constant;
         gene_index = 0
         expected_eval_matrix = np.matrix([[constants[constant_index], constants[constant_index]]])
 
         # run the evaluate
-        gene.evaluate(gene_index, eval_matrix, data_matrix, constants)
+        error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets)
         self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix))
+        self.assertEquals((1. - 0) + (1. - 0), error)
 
     def test_basic_feature_gene(self):
         """
@@ -51,6 +53,7 @@ def test_basic_feature_gene(self):
         constants = [1., 2.]
         eval_matrix = np.zeros((num_genes, num_examples))
         data_matrix = np.zeros((num_examples, num_features))
+        targets = [0] * num_examples
 
         # set the data matrix for the feature that we care about
         data_matrix[0, feature_index] = 5.
@@ -61,8 +64,9 @@ def test_basic_feature_gene(self):
         expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]]])
 
         # run the evaluate
-        gene.evaluate(gene_index, eval_matrix, data_matrix, constants)
+        error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets)
         self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix))
+        self.assertEquals((5. - 0.) + (7. - 0.), error)
 
     def test_constant_and_feature_gene(self):
         """
@@ -83,6 +87,7 @@ def test_constant_and_feature_gene(self):
         constants = [1., 2.]
         eval_matrix = np.zeros((num_genes, num_examples))
         data_matrix = np.zeros((num_examples, num_features))
+        targets = [0] * num_examples
 
         # set the data matrix for the feature that we care about
         data_matrix[0, feature_index] = 5.
@@ -93,8 +98,8 @@ def test_constant_and_feature_gene(self):
                                           [constants[constant_index], constants[constant_index]]])
 
         # run the evaluate
-        feature_gene.evaluate(0, eval_matrix, data_matrix, constants)
-        constant_gene.evaluate(1, eval_matrix, data_matrix, constants)
+        feature_error = feature_gene.evaluate(0, eval_matrix, data_matrix, constants, targets)
+        constant_error = constant_gene.evaluate(1, eval_matrix, data_matrix, constants, targets)
         self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix))
 
     def test_operator_gene_basic(self):
@@ -111,6 +116,7 @@ def test_operator_gene_basic(self):
         num_examples = 1
         num_genes = 2
         num_features = 3
+        targets = [0] * num_examples
 
         # create
         constants = []
@@ -125,5 +131,5 @@ def test_operator_gene_basic(self):
                                           [4]])
 
         # run the evaluate
-        gene.evaluate(1, eval_matrix, data_matrix, constants)
+        error = gene.evaluate(1, eval_matrix, data_matrix, constants, targets)
         self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix))

From 65b519e191141e43ed141a9ba219d652c07ea680 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Fri, 31 Mar 2017 21:48:16 -0400
Subject: [PATCH 02/51] Chromosome evaluate method.

---
 mep/genetics/chromosome.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 6d58da2..3d9c6aa 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -1,4 +1,5 @@
 import logging
+import numpy as np
 from mep.genetics.gene import Gene, VariableGene, OperatorGene
 from random import random, randint, choice
 
@@ -29,7 +30,9 @@ def __init__(self, genes, constants):
         self.genes = genes
         self.constants = constants
 
-        # TODO: track the best fitness and the associated best gene seen so far
+        # track the best found error and the associated gene
+        self.error = float('inf')
+        self.best_gene_index = -1
 
     @classmethod
     def generate_random_chromosome(cls, num_constants, constants_min, constants_max, constants_prob,
@@ -86,6 +89,25 @@ def generate_random_chromosome(cls, num_constants, constants_min, constants_max,
         # construct and return the chromosome
         return Chromosome(genes, constants)
 
+    def evaluate(self, data_matrix, targets):
+        """
+        Evaluate the various genes.
+
+        :param data_matrix: the data matrix; rows are feature vectors; comes from the data set; it is (n, m) where "n"
+        is the number of examples and "m" is the number of features.
+        :type data_matrix: np.matrix
+        :param targets: the targets; equal to the number of examples (n)
+        :type targets: list
+        """
+        num_examples = data_matrix.shape[0]
+        eval_matrix = np.zeros((len(self.genes), num_examples))
+        for gene_index, gene in enumerate(self.genes):
+            # compute the error for this gene; if it is the best we have found then update
+            error = gene.evaluate(gene_index, eval_matrix, data_matrix, self.constants, targets)
+            if error < self.error:
+                self.error = error
+                self.best_gene_index = gene_index
+
     def __str__(self):
         return "Chromosome({}, {})".format(self.genes, self.constants)
 

From f9fcd4a39b2055412201f998e348b09a45f6c084 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 1 Apr 2017 12:52:55 -0400
Subject: [PATCH 03/51] This adds the test for the chromosome evaluate method.

---
 tests/mep/genetics/test_chromosome.py | 35 ++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py
index 8cecf9d..628c803 100644
--- a/tests/mep/genetics/test_chromosome.py
+++ b/tests/mep/genetics/test_chromosome.py
@@ -1,10 +1,26 @@
 import unittest
 import random
-from mep.genetics.gene import VariableGene, OperatorGene
+from mep.genetics.gene import VariableGene, OperatorGene, Gene
 from mep.genetics.chromosome import Chromosome
 import numpy as np
 
 
+class MockedGene(Gene):
+    def __init__(self, error_to_return):
+        """
+        Initialize.
+        :param error_to_return: what to return in the evaluate
+        :type error_to_return: float
+        """
+        self.error_to_return = error_to_return
+
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
+        """
+        Simple mocked version.
+        """
+        return self.error_to_return
+
+
 class TestChromosome(unittest.TestCase):
     """
     Tests for the chromosome.
@@ -38,3 +54,20 @@ def test_basic_random_construction(self):
 
         # verify constant
         self.assertAlmostEquals(8.599796663725433, chromosome.constants[0])
+
+    def test_evaluate(self):
+        """
+        Basic test of the evaluate method.
+        """
+        # construct mocked genes
+        genes = [MockedGene(10), MockedGene(1)]
+
+        # construct chromosome
+        chromosome = Chromosome(genes, constants=[1, 2, 3])
+
+        # evaluate
+        chromosome.evaluate(np.zeros((2, 2)), targets=[20, 30])
+
+        # confirm the genes
+        self.assertEqual(genes[1], genes[chromosome.best_gene_index])
+        self.assertEqual(genes[1].error_to_return, chromosome.error)

From 8ecb9629a6797d7f2ce7f90a0f2c3ff42221da31 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 1 Apr 2017 13:57:34 -0400
Subject: [PATCH 04/51] Initialize mechanism for population and sorting
 chromosomes.

---
 mep/genetics/chromosome.py            | 10 ++++
 mep/genetics/population.py            | 67 ++++++++++++++++++++++++++-
 tests/mep/genetics/test_chromosome.py | 16 +++++++
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 3d9c6aa..0fe8be5 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -113,3 +113,13 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
+    def __lt__(self, other):
+        """
+        Less-than used by sort(...)
+
+        :param other:
+        :type other: Chromosome
+        :return:
+        """
+        return self.error < other.error
\ No newline at end of file
diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index 55ae601..dcfd097 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -1,4 +1,69 @@
+from mep.genetics.chromosome import Chromosome
+
+
 class Population(object):
     """
     A collection of chromosomes.
-    """
\ No newline at end of file
+    """
+
+    def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max, constants_prob,
+                 feature_variable_prob, num_genes, num_chromosomes, operators_prob):
+        """
+        Build a randomly constructed chromosome.
+
+        :param data_matrix: the data matrix; rows are feature vectors; comes from the data set; it is (n, m) where "n"
+        is the number of examples and "m" is the number of features.
+        :type data_matrix: np.matrix
+        :param targets: the targets; equal to the number of examples (n)
+        :type targets: list
+        :param num_constants: how many constants to have
+        :type num_constants: int
+        :param constants_min: the min range of the constants
+        :type constants_min: float
+        :param constants_max: the max range of the constants
+        :type constants_max: float
+        :param constants_prob: the probability that a given gene is a constant
+        :type constants_prob: float
+        :param feature_variable_prob: the probability that a given gene is a feature variable
+        :type feature_variable_prob: float
+        :param num_genes: how many genes
+        :type num_genes: int
+        :param num_chromosomes: how many chromosomes to use
+        :type num_chromosomes: int
+        :param operators_prob: the probability that a given gene is an operator
+        :type operators_prob: float
+        """
+        # set the variables
+        self.data_matrix = data_matrix
+        self.targets = targets
+        self.num_constants = num_constants
+        self.constants_min = constants_min
+        self.constants_max = constants_max
+        self.constants_prob = constants_prob
+        self.feature_variable_prob = feature_variable_prob
+        self.num_feature_variables = self.data_matrix.shape[1]
+        self.num_genes = num_genes
+        self.num_chromosomes = num_chromosomes
+        self.operators_prob = operators_prob
+
+        # the chromosomes
+        self.chromosomes = None
+
+    def initialize(self):
+        """
+        Initialize the random chromosomes.
+        """
+        # generate the random chromosomes
+        self.chromosomes = [Chromosome.generate_random_chromosome(self.num_constants, self.constants_min,
+                                                                  self.constants_max, self.constants_prob,
+                                                                  self.feature_variable_prob,
+                                                                  self.num_feature_variables, self.num_genes,
+                                                                  self.operators_prob)
+                            for _ in range(self.num_chromosomes)]
+
+        # evaluate
+        # TODO: this could be done in parallel
+        for chromosome in self.chromosomes:
+            chromosome.evaluate(self.data_matrix, self.targets)
+
+        # TODO: sort them?
diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py
index 628c803..f498274 100644
--- a/tests/mep/genetics/test_chromosome.py
+++ b/tests/mep/genetics/test_chromosome.py
@@ -71,3 +71,19 @@ def test_evaluate(self):
         # confirm the genes
         self.assertEqual(genes[1], genes[chromosome.best_gene_index])
         self.assertEqual(genes[1].error_to_return, chromosome.error)
+
+    def test_sort(self):
+        """
+        Test the sort mechanism.
+        """
+        # construct the chromosomes and test sorting them (by error)
+        min_chromosome, mid_chromosome, max_chromosome = Chromosome([], []), Chromosome([], []), Chromosome([], [])
+        min_chromosome.error = 1
+        mid_chromosome.error = 2
+        max_chromosome.error = 3
+        chromosomes = [mid_chromosome, max_chromosome, min_chromosome]
+        expected_chromosomes = [min_chromosome, mid_chromosome, max_chromosome]
+
+        # do the sort and verify
+        chromosomes.sort()
+        self.assertEqual(expected_chromosomes, chromosomes)

From 7dd2cc709004854452827c9896a708bcbb0bb8ae Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 1 Apr 2017 15:00:28 -0400
Subject: [PATCH 05/51] Random tournament selection.

---
 mep/genetics/population.py            | 27 +++++++++++++-
 tests/mep/genetics/test_population.py | 51 +++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 tests/mep/genetics/test_population.py

diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index dcfd097..073fd7d 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -1,4 +1,5 @@
 from mep.genetics.chromosome import Chromosome
+import random
 
 
 class Population(object):
@@ -66,4 +67,28 @@ def initialize(self):
         for chromosome in self.chromosomes:
             chromosome.evaluate(self.data_matrix, self.targets)
 
-        # TODO: sort them?
+        # sort the chromosomes
+        self.chromosomes.sort()
+
+    def random_tournament_selection(self, tournament_size):
+        """
+        Randomly select (tournament_size) chromosomes and return the best one.
+        :param tournament_size: the size of the tournament
+        :type tournament_size: int
+        :return: the
+        """
+        # TODO: Check for bad tournament size
+        best_chromosome = None
+        for _ in range(tournament_size):
+            chromosome = random.choice(self.chromosomes)
+            if best_chromosome is None or chromosome.error < best_chromosome.error:
+                best_chromosome = chromosome
+
+        return best_chromosome
+
+    def next_generation(self):
+        """
+        Advance to the next generation.
+        """
+        # TODO: populate
+
diff --git a/tests/mep/genetics/test_population.py b/tests/mep/genetics/test_population.py
new file mode 100644
index 0000000..29b235f
--- /dev/null
+++ b/tests/mep/genetics/test_population.py
@@ -0,0 +1,51 @@
+import unittest
+import random
+import numpy as np
+from mep.genetics.population import Population
+from mep.genetics.chromosome import Chromosome
+
+class TestPopulation(unittest.TestCase):
+    """
+    Test the Population class.
+    """
+
+    def test_random_tournament_selection(self):
+        """
+        Test the random_tournament_selection(...)
+        """
+        # make it so this repeatable
+        random.seed(0)
+
+        # construct the population
+        num_examples = 5
+        num_features = 7
+        population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1)
+
+        # confirm the number of feature variables (not critical for this test)
+        self.assertEqual(num_features, population.num_feature_variables)
+
+        # test the tournament selection; not that it randomly chooses the not as good chromosome
+        min_chromosome, max_chromosome = Chromosome([], []), Chromosome([], [])
+        min_chromosome.error = 1
+        max_chromosome.error = 2
+        population.chromosomes = [min_chromosome, max_chromosome]
+        self.assertEqual(max_chromosome, population.random_tournament_selection(1))
+
+    def test_larger_random_tournament_selection(self):
+        """
+        Test the random_tournament_selection(...)
+        """
+        # make it so this repeatable
+        random.seed(0)
+
+        # construct the population
+        num_examples = 5
+        num_features = 7
+        population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1)
+
+        # test the tournament selection; not that it randomly chooses the not as good chromosome
+        min_chromosome, max_chromosome = Chromosome([], []), Chromosome([], [])
+        min_chromosome.error = 1
+        max_chromosome.error = 2
+        population.chromosomes = [min_chromosome, max_chromosome]
+        self.assertEqual(min_chromosome, population.random_tournament_selection(10))
\ No newline at end of file

From f110899d2ecc25bb72c7b6a0a8ab5afbffbef5cf Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Thu, 6 Apr 2017 21:44:06 -0400
Subject: [PATCH 06/51] Chromosome crossover logic.

---
 mep/genetics/population.py | 49 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index 073fd7d..be477a7 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -47,6 +47,10 @@ def __init__(self, data_matrix, targets, num_constants, constants_min, constants
         self.num_chromosomes = num_chromosomes
         self.operators_prob = operators_prob
 
+        # TODO: take in
+        self.crossover_prob = 0.9
+        self.mutation_prob = 0.1
+
         # the chromosomes
         self.chromosomes = None
 
@@ -86,9 +90,52 @@ def random_tournament_selection(self, tournament_size):
 
         return best_chromosome
 
+    def one_cut_point_crossover(self, parent1, parent2):
+        """
+        Construct two offspring chromosomes from the parents. We determine the crossover point so that we
+        take the first genes up to that point from parent1/parent2 and then we switch.
+        :param parent1: one parent chromosome
+        :type parent1: Chromosome
+        :param parent2: the other parent chromosome
+        :type parent2: Chromosome
+        :return: two offsprings
+        :rtype: (Chromosome, Chromosome)
+        """
+        # construct the genes and constants for the offsprings from the parents
+        offspring1 = Chromosome([], [])
+        offspring2 = Chromosome([], [])
+
+        # determine the crossover point;
+        cutting_point = random.randint(0, self.num_genes)
+
+        # copy over the genes; first half and now the 2nd half (from the other chromosome)
+        offspring1.genes = parent1.genes[:cutting_point] + parent2.genes[cutting_point:]
+        offspring2.genes = parent2.genes[:cutting_point] + parent1.genes[cutting_point:]
+
+        # same thing with the constants
+        cutting_point = random.randint(0, self.num_constants)
+
+        # copy over the constants; first half and now the 2nd half
+        offspring1.constants = parent1.constants[:cutting_point] + parent2.constants[cutting_point:]
+        offspring2.constants = parent2.constants[:cutting_point] + parent1.constants[cutting_point:]
+
+        return offspring1, offspring2
+
     def next_generation(self):
         """
         Advance to the next generation.
         """
-        # TODO: populate
+        for _ in range(0, len(self.chromosomes), 2):
+            # select parents
+            chromosome1 = self.random_tournament_selection(2)
+            chromosome2 = self.random_tournament_selection(2)
+
+            # crossover
+            if random.random() < self.crossover_prob:
+                # TODO: do crossover
+                pass
+            else:
+                # offspring are copies of the parents
+                pass
+            # TODO: fill in
 

From 846e47576410a3bc121728cbb28584e24e773b78 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 8 Apr 2017 18:38:27 -0400
Subject: [PATCH 07/51] Mutation method.

---
 mep/genetics/chromosome.py | 51 +++++++++++++++++++++++++++++++++++++-
 mep/genetics/population.py | 23 ++++++++++++++---
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 0fe8be5..34329b3 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -108,6 +108,55 @@ def evaluate(self, data_matrix, targets):
                 self.error = error
                 self.best_gene_index = gene_index
 
+    def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max, constants_prob,
+               feature_variable_prob, num_feature_variables, num_genes, operators_prob):
+        """
+        Mutate the chromosome. Works by going through and randomly mutating genes and then constants.
+        :param gene_mutation_prob: probability to mutate a given gene
+        :type gene_mutation_prob: float
+        :param num_constants: how many constants to have
+        :type num_constants: int
+        :param constants_min: the min range of the constants
+        :type constants_min: float
+        :param constants_max: the max range of the constants
+        :type constants_max: float
+        :param constants_prob: the probability that a given gene is a constant
+        :type constants_prob: float
+        :param feature_variable_prob: the probability that a given gene is a feature variable
+        :type feature_variable_prob: float
+        :param num_feature_variables: how many features we have
+        :type num_feature_variables: int
+        :param num_genes: how many genes
+        :type num_genes: int
+        :param operators_prob: the probability that a given gene is an operator
+        :type operators_prob: float
+        :return: nothing
+        """
+        # the probabilities are all the same for generating a random chromosome; therefore let's construct
+        # a random chromosome and then (effectively) do a uniform crossover where a "mutate" means that we
+        # take the new chromosome's gene/constants
+        # TODO: Should we have these variables set in the chromosome then?
+        # TODO: maybe just pass in this random chromosome then?
+        random_chromosome = Chromosome.generate_random_chromosome(num_constants, constants_min,
+                                                                  constants_max, constants_prob,
+                                                                  feature_variable_prob,
+                                                                  num_feature_variables, num_genes,
+                                                                  operators_prob)
+
+        # go through mutating genes;
+        for gene_index in range(len(self.genes)):
+            # decide if we are going to mutate this gene
+            if random() <= gene_mutation_prob:
+                # mutated; therefore grab the corresponding gene from the random chromosome
+                self.genes[gene_index] = random_chromosome.genes[gene_index]
+
+        # go through mutating constants;
+        for constants_index in range(len(self.constants)):
+            # decide if we are going to mutate this gene
+            if random() <= gene_mutation_prob:
+                # mutated; therefore grab the corresponding constant from the random chromosome
+                self.constants[constants_index] = random_chromosome.constants[constants_index]
+
     def __str__(self):
         return "Chromosome({}, {})".format(self.genes, self.constants)
 
@@ -122,4 +171,4 @@ def __lt__(self, other):
         :type other: Chromosome
         :return:
         """
-        return self.error < other.error
\ No newline at end of file
+        return self.error < other.error
diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index be477a7..fc73366 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -1,5 +1,6 @@
 from mep.genetics.chromosome import Chromosome
 import random
+import copy
 
 
 class Population(object):
@@ -108,6 +109,7 @@ def one_cut_point_crossover(self, parent1, parent2):
         # determine the crossover point;
         cutting_point = random.randint(0, self.num_genes)
 
+        # TODO: copy the genes
         # copy over the genes; first half and now the 2nd half (from the other chromosome)
         offspring1.genes = parent1.genes[:cutting_point] + parent2.genes[cutting_point:]
         offspring2.genes = parent2.genes[:cutting_point] + parent1.genes[cutting_point:]
@@ -132,10 +134,23 @@ def next_generation(self):
 
             # crossover
             if random.random() < self.crossover_prob:
-                # TODO: do crossover
-                pass
+                offspring1, offspring2 = self.one_cut_point_crossover(chromosome1, chromosome2)
             else:
                 # offspring are copies of the parents
-                pass
-            # TODO: fill in
+                offspring1 = copy.copy(chromosome1)
+                offspring2 = copy.copy(chromosome2)
+
+            # mutate (potentially) offspring
+            offspring1.mutate(self.mutation_prob, self.num_constants, self.constants_min,
+                              self.constants_max, self.constants_prob,
+                              self.feature_variable_prob,
+                              self.num_feature_variables, self.num_genes,
+                              self.operators_prob)
+            # TODO: evaluate
+            offspring2.mutate(self.mutation_prob, self.num_constants, self.constants_min,
+                              self.constants_max, self.constants_prob,
+                              self.feature_variable_prob,
+                              self.num_feature_variables, self.num_genes,
+                              self.operators_prob)
 
+            # TODO: fill in

From 7059d0f2323720b328f4fa22dcd6d7f54285d73d Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 8 Apr 2017 19:08:42 -0400
Subject: [PATCH 08/51] Offspring evaluation and insertion.

---
 mep/genetics/population.py | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index fc73366..da5e3b0 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -140,17 +140,41 @@ def next_generation(self):
                 offspring1 = copy.copy(chromosome1)
                 offspring2 = copy.copy(chromosome2)
 
-            # mutate (potentially) offspring
+            # TODO: we could consolidate the offspring code into one method and just call it twice
+            # mutate (potentially) offspring; calculate error
             offspring1.mutate(self.mutation_prob, self.num_constants, self.constants_min,
                               self.constants_max, self.constants_prob,
                               self.feature_variable_prob,
                               self.num_feature_variables, self.num_genes,
                               self.operators_prob)
-            # TODO: evaluate
+            offspring1.evaluate(self.data_matrix, self.targets)
             offspring2.mutate(self.mutation_prob, self.num_constants, self.constants_min,
                               self.constants_max, self.constants_prob,
                               self.feature_variable_prob,
                               self.num_feature_variables, self.num_genes,
                               self.operators_prob)
+            offspring2.evaluate(self.data_matrix, self.targets)
+
+            # replace the worst chromosome in the population; note that the chromosomes start in a sorted
+            # order so the one at the end has the highest error; we now insert the offspring into the list
+            # at their error level -- i.e. keep it in sorted order
+            # TODO: We should be able to do this in one loop but let's do each offspring separately as it is clearer
+            insert_index = -1
+            for chromosome_index, chromosome in enumerate(self.chromosomes):
+                if offspring1.error < chromosome.error:
+                    insert_index = chromosome_index
+                    break
+            # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index
+            if insert_index > -1:
+                self.chromosomes.insert(insert_index, offspring1)
+
+            # now the other offspring
+            insert_index = -1
+            for chromosome_index, chromosome in enumerate(self.chromosomes):
+                if offspring2.error < chromosome.error:
+                    insert_index = chromosome_index
+                    break
+            # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index
+            if insert_index > -1:
+                self.chromosomes.insert(insert_index, offspring2)
 
-            # TODO: fill in

From c5b124d14af835289662a0e07067b85776db74c1 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sun, 9 Apr 2017 09:35:16 -0400
Subject: [PATCH 09/51] Running the population. Messy prints but working.

---
 mep/config/config.json     |  2 +-
 mep/dataset.py             |  8 ++++++--
 mep/genetics/chromosome.py | 32 ++++++++++++++++++++++++++++++++
 mep/genetics/gene.py       | 11 +++++++++++
 mep/genetics/population.py |  6 ++----
 mep/main.py                | 38 +++++++++++++++++++++++++++++++++-----
 6 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/mep/config/config.json b/mep/config/config.json
index d1dea47..efb51db 100644
--- a/mep/config/config.json
+++ b/mep/config/config.json
@@ -8,7 +8,7 @@
   "mutation_probability": 0.1,
   "crossover_probability": 0.9,
 
-  "variables_probability": 0.4,
+  "feature_variables_probability": 0.4,
   "operators_probability": 0.5,
 
   "num_constants": 3,
diff --git a/mep/dataset.py b/mep/dataset.py
index 850d93a..4f09241 100644
--- a/mep/dataset.py
+++ b/mep/dataset.py
@@ -13,5 +13,9 @@ def __init__(self, filename):
         :param filename: the filename (full path to CSV) of the data
         :type filename: str
         """
-        # TODO: What about supporting other file formats?
-        self.data = pd.read_csv(filename)
\ No newline at end of file
+        # we assume this in the format of feature cols and then target
+        self.data = pd.read_csv(filename)
+
+        # extract out data matrix and target
+        self.target = self.data.target.values
+        self.data_matrix = self.data.drop("target", 1).values
diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 34329b3..ca96f4f 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -160,6 +160,38 @@ def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max
     def __str__(self):
         return "Chromosome({}, {})".format(self.genes, self.constants)
 
+    def pretty_string(self, stop_at_best=True):
+        """
+        Output in a program like format. First show the constants. Then one line per gene.
+        :return: the program
+        :rtype: str
+        """
+        # first we show the constants
+        program = "CONSTANTS = [{}]\n".format(",".join([str(c) for c in self.constants]))
+
+        # now show each gene on a separate line
+        for gene_index, gene in enumerate(self.genes):
+            gene_str = gene.__str__()
+            if type(gene) == VariableGene:
+                gene_str = gene.pretty_string()
+            elif type(gene) == OperatorGene:
+                # TODO: Push this logic into the gene; the only tricky part is the operator lambda; we will probably
+                # need to replace the lambda with a larger object
+                if gene.operation == Chromosome.operator_lambdas[0]:
+                    op = "+"
+                elif gene.operation == Chromosome.operator_lambdas[1]:
+                    op = "-"
+                elif gene.operation == Chromosome.operator_lambdas[2]:
+                    op = "*"
+                gene_str = "PROGRAM[{}] {} PROGRAM[{}]".format(gene.address1, op, gene.address2)
+            program += "{}:{}\n".format(gene_index, gene_str)
+
+            if self.best_gene_index == gene_index and stop_at_best:
+                return program
+
+        # if we want to print the full program
+        return program
+
     def __repr__(self):
         return self.__str__()
 
diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index 14ce831..b1adc0a 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -97,6 +97,17 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
     def __str__(self):
         return "VariableGene({}, is_feature={})".format(self.index, self.is_feature)
 
+    def pretty_string(self):
+        """
+        Pretty program string version.
+        :return: string version
+        :rtype: str
+        """
+        if self.is_feature:
+            return "FEATURES[{}]".format(self.index)
+        else:
+            return "CONSTANTS[{}]".format(self.index)
+
     def __repr__(self):
         return self.__str__()
 
diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index da5e3b0..2aea971 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -8,7 +8,7 @@ class Population(object):
     A collection of chromosomes.
     """
 
-    def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max, constants_prob,
+    def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max,
                  feature_variable_prob, num_genes, num_chromosomes, operators_prob):
         """
         Build a randomly constructed chromosome.
@@ -24,8 +24,6 @@ def __init__(self, data_matrix, targets, num_constants, constants_min, constants
         :type constants_min: float
         :param constants_max: the max range of the constants
         :type constants_max: float
-        :param constants_prob: the probability that a given gene is a constant
-        :type constants_prob: float
         :param feature_variable_prob: the probability that a given gene is a feature variable
         :type feature_variable_prob: float
         :param num_genes: how many genes
@@ -41,7 +39,7 @@ def __init__(self, data_matrix, targets, num_constants, constants_min, constants
         self.num_constants = num_constants
         self.constants_min = constants_min
         self.constants_max = constants_max
-        self.constants_prob = constants_prob
+        self.constants_prob = 1. - operators_prob - feature_variable_prob
         self.feature_variable_prob = feature_variable_prob
         self.num_feature_variables = self.data_matrix.shape[1]
         self.num_genes = num_genes
diff --git a/mep/main.py b/mep/main.py
index b42be36..e7abdd6 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -3,14 +3,20 @@
 import json
 import logging
 import os
+from dataset import DataSet
+from mep.genetics.population import Population
 
 if __name__ == "__main__":
-    # TODO: Get the data file
+    # TODO: error check usage
+
+    # get the data file
+    data_set_name = sys.argv[1]
+    data_set = DataSet(data_set_name)
 
     # read config file
-    # TODO: Possible config file override on comand line
-    with open("mep/config/config.json") as data_file:
-        config = json.load(data_file)
+    # TODO: Possible config file override on command line
+    with open("mep/config/config.json") as config_file:
+        config = json.load(config_file)
 
     # construct output logs dir if it doesn't exist
     output_logs_dir = config["output_logs"]
@@ -25,5 +31,27 @@
     logger = logging.getLogger("main")
     logger.info("Starting up...")
 
-    
+    # construct a population and run it for the number of generations specified
+    population = Population(data_set.data_matrix, data_set.target, int(config["num_constants"]),
+                            float(config["constants_min"]), float(config["constants_max"]),
+                            float(config["feature_variables_probability"]),
+                            int(config["code_length"]), int(config["population_size"]),
+                            float(config["operators_probability"]))
+    population.initialize()
+
+    # iterate through the generations
+    best_chromosome = None
+    for generation in range(int(config["num_generations"])):
+        best_chromosome = population.chromosomes[0]
+        logger.debug("Generation number {} best chromosome error {}".format(generation,
+                                                                            best_chromosome.error))
+        print("Generation number {} best chromosome error {}".format(generation,
+                                                                     best_chromosome.error))
+        if best_chromosome.error == 0:
+            logger.debug("Exiting early as we have hit the best possible error.")
+            break
+        population.next_generation()
 
+    print("Best chromosome error {} and chromosome {}".format(best_chromosome.error, best_chromosome))
+    print("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
+                                                                         best_chromosome.pretty_string()))

From fee6039a99b70243f2e64215c8a5e6bc71dfcdc0 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Tue, 11 Apr 2017 19:49:36 -0400
Subject: [PATCH 10/51] Updated the readme

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 632143f..711bfd8 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,7 @@
-# py-mep
\ No newline at end of file
+# Multi Expression Programming
+
+This is an implmentation of the MEP algorithm defined here:
+
+Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB.
+
+Based upon the C++ code here: https://github.com/mepx/mep-basic-src.

From d7f0b04dadb1407aaf53d94d72f644719da6b90c Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Tue, 11 Apr 2017 20:11:59 -0400
Subject: [PATCH 11/51] The environment.yaml to build the conda environment.

---
 README.md        |  4 +-
 environment.yaml | 95 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 2 deletions(-)
 create mode 100644 environment.yaml

diff --git a/README.md b/README.md
index 711bfd8..78e8182 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,6 @@
 
 This is an implmentation of the MEP algorithm defined here:
 
-Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB.
+> Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB.
 
-Based upon the C++ code here: https://github.com/mepx/mep-basic-src.
+Based upon the C++ code [here](https://github.com/mepx/mep-basic-src).
\ No newline at end of file
diff --git a/environment.yaml b/environment.yaml
new file mode 100644
index 0000000..61b1b1c
--- /dev/null
+++ b/environment.yaml
@@ -0,0 +1,95 @@
+name: py-mep-dev
+dependencies:
+- backports=1.0=py27_0
+- backports_abc=0.5=py27_0
+- bleach=1.5.0=py27_0
+- configparser=3.5.0=py27_0
+- dbus=1.10.10=0
+- decorator=4.0.11=py27_0
+- entrypoints=0.2.2=py27_1
+- enum34=1.1.6=py27_0
+- expat=2.1.0=0
+- fontconfig=2.12.1=3
+- freetype=2.5.5=2
+- functools32=3.2.3.2=py27_0
+- get_terminal_size=1.0.0=py27_0
+- glib=2.50.2=1
+- gst-plugins-base=1.8.0=0
+- gstreamer=1.8.0=0
+- html5lib=0.999=py27_0
+- icu=54.1=0
+- ipykernel=4.6.0=py27_0
+- ipython=5.3.0=py27_0
+- ipython_genutils=0.2.0=py27_0
+- ipywidgets=6.0.0=py27_0
+- jinja2=2.9.6=py27_0
+- jpeg=9b=0
+- jsonschema=2.5.1=py27_0
+- jupyter=1.0.0=py27_3
+- jupyter_client=5.0.1=py27_0
+- jupyter_console=5.1.0=py27_0
+- jupyter_core=4.3.0=py27_0
+- libffi=3.2.1=1
+- libgcc=5.2.0=0
+- libiconv=1.14=0
+- libpng=1.6.27=0
+- libsodium=1.0.10=0
+- libxcb=1.12=1
+- libxml2=2.9.4=0
+- markupsafe=0.23=py27_2
+- mistune=0.7.4=py27_0
+- mkl=2017.0.1=0
+- nbconvert=5.1.1=py27_0
+- nbformat=4.3.0=py27_0
+- notebook=5.0.0=py27_0
+- numpy=1.12.1=py27_0
+- openssl=1.0.2k=1
+- pandas=0.19.2=np112py27_1
+- pandocfilters=1.4.1=py27_0
+- path.py=10.1=py27_0
+- pathlib2=2.2.1=py27_0
+- pcre=8.39=1
+- pexpect=4.2.1=py27_0
+- pickleshare=0.7.4=py27_0
+- pip=9.0.1=py27_1
+- prompt_toolkit=1.0.14=py27_0
+- ptyprocess=0.5.1=py27_0
+- py=1.4.32=py27_0
+- pygments=2.2.0=py27_0
+- pyqt=5.6.0=py27_2
+- pytest=3.0.7=py27_0
+- python=2.7.13=0
+- python-dateutil=2.6.0=py27_0
+- pytz=2017.2=py27_0
+- pyzmq=16.0.2=py27_0
+- qt=5.6.2=3
+- qtconsole=4.3.0=py27_0
+- readline=6.2=2
+- scandir=1.5=py27_0
+- setuptools=27.2.0=py27_0
+- simplegeneric=0.8.1=py27_1
+- singledispatch=3.4.0.3=py27_0
+- sip=4.18=py27_0
+- six=1.10.0=py27_0
+- sqlite=3.13.0=0
+- ssl_match_hostname=3.4.0.2=py27_1
+- terminado=0.6=py27_0
+- testpath=0.3=py27_0
+- tk=8.5.18=0
+- tornado=4.4.2=py27_0
+- traitlets=4.3.2=py27_0
+- wcwidth=0.1.7=py27_0
+- wheel=0.29.0=py27_0
+- widgetsnbextension=2.0.0=py27_0
+- zeromq=4.1.5=0
+- zlib=1.2.8=3
+- pip:
+  - backports-abc==0.5
+  - backports.shutil-get-terminal-size==1.0.0
+  - backports.ssl-match-hostname==3.4.0.2
+  - ipython-genutils==0.2.0
+  - jupyter-client==5.0.1
+  - jupyter-console==5.1.0
+  - jupyter-core==4.3.0
+  - prompt-toolkit==1.0.14
+

From b0e082d27eaba41ab7f9da2adbb3fd66887fbead Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Tue, 11 Apr 2017 20:27:26 -0400
Subject: [PATCH 12/51] Example datasets.

---
 .gitignore         |  3 ++-
 README.md          | 13 ++++++++++++-
 datasets/data1.csv | 10 ++++++++++
 datasets/data2.csv | 10 ++++++++++
 datasets/files.txt |  2 ++
 5 files changed, 36 insertions(+), 2 deletions(-)
 create mode 100644 datasets/data1.csv
 create mode 100644 datasets/data2.csv
 create mode 100644 datasets/files.txt

diff --git a/.gitignore b/.gitignore
index 8419443..9eda0dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,4 +92,5 @@ ENV/
 .idea/*
 
 # logs
-output_logs/*
\ No newline at end of file
+output_logs/*
+ignored/*
\ No newline at end of file
diff --git a/README.md b/README.md
index 78e8182..192029c 100644
--- a/README.md
+++ b/README.md
@@ -4,4 +4,15 @@ This is an implmentation of the MEP algorithm defined here:
 
 > Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB.
 
-Based upon the C++ code [here](https://github.com/mepx/mep-basic-src).
\ No newline at end of file
+Based upon the C++ code [here](https://github.com/mepx/mep-basic-src).
+
+## Running py-mep
+
+Create the conda environment and source it (Linux):
+
+```
+conda env create -f environment.yml
+source activate py-mep-dev
+```
+
+Example, running with a dataset `python -m mep.main datasets/data1.csv`
\ No newline at end of file
diff --git a/datasets/data1.csv b/datasets/data1.csv
new file mode 100644
index 0000000..2a906b5
--- /dev/null
+++ b/datasets/data1.csv
@@ -0,0 +1,10 @@
+x1,x2,target
+0,0,0
+1,2,3
+12,2,14
+-12,90,78
+3,4,7
+0,-1,-1
+23,0,23
+8,16,24
+-10,-15,-25
diff --git a/datasets/data2.csv b/datasets/data2.csv
new file mode 100644
index 0000000..f839365
--- /dev/null
+++ b/datasets/data2.csv
@@ -0,0 +1,10 @@
+x1,x2,x3,target
+0,0,0,0
+1,2,2,4
+12,2,4,28
+-12,90,5,-1075
+3,4,1,13
+0,-1,-10,-10
+23,0,15,15
+8,16,1,129
+-10,-15,-50,100
diff --git a/datasets/files.txt b/datasets/files.txt
new file mode 100644
index 0000000..d8aa9d8
--- /dev/null
+++ b/datasets/files.txt
@@ -0,0 +1,2 @@
+data1.csv is f(x_1, x_2) = x_1 + x_2
+data2.csv is f(x_1, x_2) = x_1 * x_2 + x_3

From bdf3ab2a4e5800ac699f52fccd94e794ead1ed9e Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Fri, 14 Apr 2017 12:14:31 -0400
Subject: [PATCH 13/51] Fix the missing 's' in the logger format.

---
 mep/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mep/main.py b/mep/main.py
index e7abdd6..d8afb6e 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -27,7 +27,7 @@
     logging.basicConfig(filename="{}/MEP_{}.log".format(output_logs_dir, dt.datetime.now().strftime("%Y%m%d")),
                         level=logging.DEBUG,
                         filemode='w',
-                        format="%(asctime)s %(name)s %(funcName)s %(levelname) %(message)s")
+                        format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s")
     logger = logging.getLogger("main")
     logger.info("Starting up...")
 

From 545d2717d63a6242a9064b7d56949abbbadf7eb7 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Fri, 14 Apr 2017 20:01:27 -0400
Subject: [PATCH 14/51] Logic to prune the unused genes.

---
 mep/genetics/chromosome.py | 51 +++++++++++++++++++++++++++++++++++++-
 mep/main.py                | 17 +++++++++----
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index ca96f4f..172f658 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -1,5 +1,6 @@
 import logging
 import numpy as np
+from collections import deque
 from mep.genetics.gene import Gene, VariableGene, OperatorGene
 from random import random, randint, choice
 
@@ -24,7 +25,7 @@ def __init__(self, genes, constants):
         :param constants: the constants
         :type constants: list of float
         """
-        # self.logger = logging.getLogger(self.__class__)
+        self.logger = logging.getLogger(self.__class__.__name__)
 
         # core genes and constants lists
         self.genes = genes
@@ -192,6 +193,54 @@ def pretty_string(self, stop_at_best=True):
         # if we want to print the full program
         return program
 
+    def prune(self):
+        """
+        Trim out the unused genes. NOTE: This "breaks" the chromosomes as it is going to change how many genes are
+        in the program. Only do this once we have finished evolving the program.
+        """
+        # the best gene index is going to be the last line of the program; since the genes never reference genes
+        # beyond it then we just proceed back to the top and remove any which haven't been referenced; we determine
+        # this via a BFS type search
+
+        # the genes that are in use -- i.e. that will be kept;
+        gene_indices_in_use = set()
+        visited = set()
+
+        # start from best gene index
+        genes_indices_to_visit = deque()
+        genes_indices_to_visit.appendleft(self.best_gene_index)
+        gene_indices_in_use.add(self.best_gene_index)
+
+        while len(genes_indices_to_visit) > 0:
+            # the index to visit
+            gene_index = genes_indices_to_visit.pop()
+
+            # mark as visited
+            visited.add(gene_index)
+
+            # check the addresses on the gene if it is an operator
+            gene = self.genes[gene_index]
+            if type(gene) == OperatorGene:
+                genes_indices_to_visit.appendleft(gene.address1)
+                genes_indices_to_visit.appendleft(gene.address2)
+                gene_indices_in_use.add(gene.address1)
+                gene_indices_in_use.add(gene.address2)
+                self.logger.debug("At gene index {} which references {} and {}".format(gene_index,
+                                                                                       gene.address1, gene.address2))
+
+        # now remove any genes that aren't used
+        gene_indices_in_use = list(gene_indices_in_use)
+        gene_indices_in_use.sort()
+        self.logger.debug("All gene indices in use {}".format(gene_indices_in_use))
+        self.genes = [self.genes[i] for i in gene_indices_in_use]
+
+        # TODO: This could be done in the list comprehension but it is clearer to just do another pass
+        # re-map the address to the new index
+        for gene in self.genes:
+            if type(gene) == OperatorGene:
+                gene.address1 = gene_indices_in_use.index(gene.address1)
+                gene.address2 = gene_indices_in_use.index(gene.address2)
+
     def __repr__(self):
         return self.__str__()
 
diff --git a/mep/main.py b/mep/main.py
index d8afb6e..985bdd1 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -45,13 +45,20 @@
         best_chromosome = population.chromosomes[0]
         logger.debug("Generation number {} best chromosome error {}".format(generation,
                                                                             best_chromosome.error))
-        print("Generation number {} best chromosome error {}".format(generation,
-                                                                     best_chromosome.error))
         if best_chromosome.error == 0:
             logger.debug("Exiting early as we have hit the best possible error.")
             break
         population.next_generation()
 
-    print("Best chromosome error {} and chromosome {}".format(best_chromosome.error, best_chromosome))
-    print("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
-                                                                         best_chromosome.pretty_string()))
+    logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
+                                                                                best_chromosome.pretty_string()))
+
+    # prune out the unused genes
+    best_chromosome.prune()
+    logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
+                                                                                best_chromosome.pretty_string()))
+
+    # TODO: Convert the output to a valid python program
+    # TODO: Add support for classification
+    # TODO: Add example digital circuit test
+    # TODO: Add UDFs

From dacec6d1f859511d4d7ed8309d22ee2a50ece2e7 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sun, 16 Apr 2017 13:33:35 -0400
Subject: [PATCH 15/51] Convert to a python program.

---
 mep/genetics/chromosome.py | 49 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 172f658..f0e8c5c 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -241,6 +241,55 @@ def prune(self):
                 gene.address1 = gene_indices_in_use.index(gene.address1)
                 gene.address2 = gene_indices_in_use.index(gene.address2)
 
+    def to_python(self):
+        """
+        Convert to python program string.
+        :return: python string program
+        :rtype: str
+        """
+        # python program string
+        python_program = """
+import sys
+
+if __name__ == "__main__":
+    # constants
+    {}
+
+    # now the genes
+    {}
+
+    # print out the final answer
+    {}
+    """
+
+        # constants
+        constants_str = "constants = {}".format(self.constants)
+
+        # genes
+        genes_str = "program = [0] * {}\n".format(len(self.genes))
+        for gene_index, gene in enumerate(self.genes):
+            genes_str += "    program[{}] = ".format(gene_index)
+            if type(gene) == VariableGene:
+                if gene.is_feature:
+                    genes_str += "float(sys.argv[{}])".format(gene.index + 1)
+                else:
+                    genes_str += "constants[{}]".format(gene.index)
+            elif type(gene) == OperatorGene:
+                if gene.operation == Chromosome.operator_lambdas[0]:
+                    op = "+"
+                elif gene.operation == Chromosome.operator_lambdas[1]:
+                    op = "-"
+                elif gene.operation == Chromosome.operator_lambdas[2]:
+                    op = "*"
+                genes_str += "program[{}] {} program[{}]".format(gene.address1, op, gene.address2)
+            genes_str += "\n"
+
+        # print statement
+        python_program = python_program.format(constants_str, genes_str, "print(program[{}])".format(len(self.genes)-1))
+
+        # return it
+        return python_program
+
     def __repr__(self):
         return self.__str__()
 

From 6dff56eda7d5c65f1c1d9bdf05604346f27be031 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Mon, 1 May 2017 21:00:43 -0400
Subject: [PATCH 16/51] Print the python program into a file.

---
 README.md   |  2 +-
 mep/main.py | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 192029c..24ee508 100644
--- a/README.md
+++ b/README.md
@@ -15,4 +15,4 @@ conda env create -f environment.yml
 source activate py-mep-dev
 ```
 
-Example, running with a dataset `python -m mep.main datasets/data1.csv`
\ No newline at end of file
+Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`.
\ No newline at end of file
diff --git a/mep/main.py b/mep/main.py
index 985bdd1..74f8767 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -11,6 +11,7 @@
 
     # get the data file
     data_set_name = sys.argv[1]
+    python_file_name = sys.argv[2]
     data_set = DataSet(data_set_name)
 
     # read config file
@@ -53,11 +54,20 @@
     logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
                                                                                 best_chromosome.pretty_string()))
 
+    # TODO: this should probably be optional
     # prune out the unused genes
     best_chromosome.prune()
     logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
                                                                                 best_chromosome.pretty_string()))
 
+    # TODO: Optional?
+    # we then convert the chromosome into a valid python program and write it out to file
+    with open(python_file_name, 'w') as python_file:
+        python_program = best_chromosome.to_python()
+        logger.debug("Write out the python program to {}".format(python_file_name))
+        logger.debug(python_program)
+        python_file.write(python_program)
+
     # TODO: Convert the output to a valid python program
     # TODO: Add support for classification
     # TODO: Add example digital circuit test

From c87cd6143d7eb2a6d78d791ac956a1bcfd1e903f Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Mon, 28 Aug 2017 20:01:43 -0400
Subject: [PATCH 17/51] Fix the README's filename and fix the broken test.

---
 README.md                             | 2 +-
 tests/mep/genetics/test_population.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 24ee508..ad723ed 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Based upon the C++ code [here](https://github.com/mepx/mep-basic-src).
 Create the conda environment and source it (Linux):
 
 ```
-conda env create -f environment.yml
+conda env create -f environment.yaml
 source activate py-mep-dev
 ```
 
diff --git a/tests/mep/genetics/test_population.py b/tests/mep/genetics/test_population.py
index 29b235f..bce9cd5 100644
--- a/tests/mep/genetics/test_population.py
+++ b/tests/mep/genetics/test_population.py
@@ -19,7 +19,7 @@ def test_random_tournament_selection(self):
         # construct the population
         num_examples = 5
         num_features = 7
-        population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1)
+        population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1)
 
         # confirm the number of feature variables (not critical for this test)
         self.assertEqual(num_features, population.num_feature_variables)
@@ -41,7 +41,7 @@ def test_larger_random_tournament_selection(self):
         # construct the population
         num_examples = 5
         num_features = 7
-        population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1)
+        population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1)
 
         # test the tournament selection; not that it randomly chooses the not as good chromosome
         min_chromosome, max_chromosome = Chromosome([], []), Chromosome([], [])

From ba35e89259b40dce3c3dd8cc6d9675ea8c1decd5 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Mon, 28 Aug 2017 20:08:50 -0400
Subject: [PATCH 18/51] Bug fix, stop doing an insert, the population size
 should be fixed.

---
 mep/genetics/population.py            | 4 ++--
 mep/main.py                           | 5 +++--
 tests/mep/genetics/test_population.py | 1 +
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index 2aea971..dd60592 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -164,7 +164,7 @@ def next_generation(self):
                     break
             # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index
             if insert_index > -1:
-                self.chromosomes.insert(insert_index, offspring1)
+                self.chromosomes[insert_index] = offspring1
 
             # now the other offspring
             insert_index = -1
@@ -174,5 +174,5 @@ def next_generation(self):
                     break
             # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index
             if insert_index > -1:
-                self.chromosomes.insert(insert_index, offspring2)
+                self.chromosomes[insert_index] = offspring2
 
diff --git a/mep/main.py b/mep/main.py
index 74f8767..e3528ab 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -44,8 +44,9 @@
     best_chromosome = None
     for generation in range(int(config["num_generations"])):
         best_chromosome = population.chromosomes[0]
-        logger.debug("Generation number {} best chromosome error {}".format(generation,
-                                                                            best_chromosome.error))
+        logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format(
+            generation, best_chromosome.error, len(population.chromosomes)))
+
         if best_chromosome.error == 0:
             logger.debug("Exiting early as we have hit the best possible error.")
             break
diff --git a/tests/mep/genetics/test_population.py b/tests/mep/genetics/test_population.py
index bce9cd5..78892b6 100644
--- a/tests/mep/genetics/test_population.py
+++ b/tests/mep/genetics/test_population.py
@@ -4,6 +4,7 @@
 from mep.genetics.population import Population
 from mep.genetics.chromosome import Chromosome
 
+
 class TestPopulation(unittest.TestCase):
     """
     Test the Population class.

From 5f2fbc587d1e505ef3205d09cf1efa6f5a7d858b Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 30 Dec 2017 16:07:26 -0500
Subject: [PATCH 19/51] This puts in an example evolved program in the README

---
 README.md | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ad723ed..64312ca 100644
--- a/README.md
+++ b/README.md
@@ -15,4 +15,23 @@ conda env create -f environment.yaml
 source activate py-mep-dev
 ```
 
-Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`.
\ No newline at end of file
+Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`.
+
+An example Python program evolved to solve the addition problem of adding together two features (ex: datasets/data1.csv):
+```
+import sys
+
+if __name__ == "__main__":
+    # constants
+    constants = [0.45084442258242485, -0.464331279636617, -0.5128830066318446]
+
+    # now the genes
+    program = [0] * 3
+    program[0] = float(sys.argv[2])
+    program[1] = float(sys.argv[1])
+    program[2] = program[0] + program[1]
+
+
+    # print out the final answer
+    print(program[2])
+```
\ No newline at end of file

From ed408fd3318466bf367beefe7f331ebd0953f84f Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 30 Dec 2017 16:18:48 -0500
Subject: [PATCH 20/51] Add another function example.

---
 datasets/data3.csv | 20 ++++++++++++++++++++
 datasets/files.txt |  1 +
 2 files changed, 21 insertions(+)
 create mode 100644 datasets/data3.csv

diff --git a/datasets/data3.csv b/datasets/data3.csv
new file mode 100644
index 0000000..ae9e86e
--- /dev/null
+++ b/datasets/data3.csv
@@ -0,0 +1,20 @@
+x1,x2,x3,target
+1,2,3,0
+3,2,1,10
+2,3,1,6
+0,0,0,0
+5,4,10,19
+7,6,4,51
+-1,2,5,-2
+2,-1,6,-3
+3,4,-10,23
+5,6,9,22
+-3,-6,-6,9
+-7,4,0,53
+0,2,5,-3
+6,0,-1,37
+-8,5,0,69
+-3,-3,-3,9
+2,1,2,3
+-6,-3,2,31
+0,9,4,5
diff --git a/datasets/files.txt b/datasets/files.txt
index d8aa9d8..cbb59c7 100644
--- a/datasets/files.txt
+++ b/datasets/files.txt
@@ -1,2 +1,3 @@
 data1.csv is f(x_1, x_2) = x_1 + x_2
 data2.csv is f(x_1, x_2) = x_1 * x_2 + x_3
+data3.csv is f(x_1, x_2, x_3) = x_1 * x_1 + x_2 - x_3

From 5b38a5f6e2580465c0ab1f11847e8388302869c9 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 30 Dec 2017 17:55:44 -0500
Subject: [PATCH 21/51] Instead of just lambdas let's use callable objects.

---
 mep/genetics/chromosome.py |  37 ++++++-------
 mep/genetics/gene.py       |   3 +-
 mep/genetics/operator.py   | 103 +++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+), 22 deletions(-)
 create mode 100644 mep/genetics/operator.py

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index f0e8c5c..c7d5738 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -2,6 +2,7 @@
 import numpy as np
 from collections import deque
 from mep.genetics.gene import Gene, VariableGene, OperatorGene
+from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator
 from random import random, randint, choice
 
 
@@ -13,9 +14,9 @@ class Chromosome(object):
     """
 
     # valid operators
-    operator_lambdas = [lambda a, b: a + b,  # +
-                        lambda a, b: a - b,  # -
-                        lambda a, b: a * b]  # *
+    operators_family = [AdditionOperator,
+                        MultiplicationOperator,
+                        SubtractionOperator]
 
     def __init__(self, genes, constants):
         """
@@ -80,7 +81,7 @@ def generate_random_chromosome(cls, num_constants, constants_min, constants_max,
             prob = random()
             if prob <= operators_prob:
                 # randomly choose valid addresses; randomly choose an operator
-                genes.append(OperatorGene(choice(Chromosome.operator_lambdas),
+                genes.append(OperatorGene(choice(Chromosome.operators_family)(),
                                           randint(0, gene_index - 1), randint(0, gene_index - 1)))
             elif prob <= operators_prob + feature_variable_prob:
                 genes.append(VariableGene(randint(0, num_feature_variables - 1), is_feature=True))
@@ -176,15 +177,8 @@ def pretty_string(self, stop_at_best=True):
             if type(gene) == VariableGene:
                 gene_str = gene.pretty_string()
             elif type(gene) == OperatorGene:
-                # TODO: Push this logic into the gene; the only tricky part is the operator lambda; we will probably
-                # need to replace the lambda with a larger object
-                if gene.operation == Chromosome.operator_lambdas[0]:
-                    op = "+"
-                elif gene.operation == Chromosome.operator_lambdas[1]:
-                    op = "-"
-                elif gene.operation == Chromosome.operator_lambdas[2]:
-                    op = "*"
-                gene_str = "PROGRAM[{}] {} PROGRAM[{}]".format(gene.address1, op, gene.address2)
+                gene_str = "{}(PROGRAM[{}], PROGRAM[{}])".format(gene.operation.function_name(),
+                                                                 gene.address1, gene.address2)
             program += "{}:{}\n".format(gene_index, gene_str)
 
             if self.best_gene_index == gene_index and stop_at_best:
@@ -251,6 +245,9 @@ def to_python(self):
         python_program = """
 import sys
 
+# define operator/functions
+{}
+
 if __name__ == "__main__":
     # constants
     {}
@@ -261,6 +258,8 @@ def to_python(self):
     # print out the final answer
     {}
     """
+        # define all the function/operators
+        operator_def_str = "\n".join([operator().function_python_definition() for operator in self.operators_family])
 
         # constants
         constants_str = "constants = {}".format(self.constants)
@@ -275,17 +274,13 @@ def to_python(self):
                 else:
                     genes_str += "constants[{}]".format(gene.index)
             elif type(gene) == OperatorGene:
-                if gene.operation == Chromosome.operator_lambdas[0]:
-                    op = "+"
-                elif gene.operation == Chromosome.operator_lambdas[1]:
-                    op = "-"
-                elif gene.operation == Chromosome.operator_lambdas[2]:
-                    op = "*"
-                genes_str += "program[{}] {} program[{}]".format(gene.address1, op, gene.address2)
+                genes_str += "{}(program[{}], program[{}])".format(gene.operation.function_name(),
+                                                                   gene.address1, gene.address2)
             genes_str += "\n"
 
         # print statement
-        python_program = python_program.format(constants_str, genes_str, "print(program[{}])".format(len(self.genes)-1))
+        python_program = python_program.format(operator_def_str, constants_str, genes_str,
+                                               "print(program[{}])".format(len(self.genes)-1))
 
         # return it
         return python_program
diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index b1adc0a..a83c00b 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -31,7 +31,8 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
         """
 
 
-# TODO: Should we also add a mutate method to the gene itself?
+# NOTE: Should we also add a mutate method to the gene itself? Considering that we are doing the mutation by doing
+# a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit.
 
 
 class VariableGene(object):
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py
new file mode 100644
index 0000000..4a0060d
--- /dev/null
+++ b/mep/genetics/operator.py
@@ -0,0 +1,103 @@
+from abc import ABCMeta, abstractmethod
+
+
+class Operator(object):
+    """
+    This is more of a function than a traditional "operator" but the function could be simply using an operator
+    like "+", "-", etc. At its core these are indivisible functions that take arguments (i.e. preceding genes) and
+    output some value.
+    """
+    __metaclass__ = ABCMeta
+
+    @abstractmethod
+    def __call__(self, *args, **kwargs):
+        """
+        Run the operation/function and return the result.
+        """
+
+    @abstractmethod
+    def function_name(self):
+        """
+        Return the name of the function for use in the pretty print and the python program.
+        """
+
+    @abstractmethod
+    def function_python_definition(self):
+        """
+        Return the python definition of the function
+        """
+
+
+# TODO: Consolidate these into just one Operator?
+class AdditionOperator(Operator):
+    """
+    Perform addition.
+    """
+    # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes
+
+    def __call__(self, *args, **kwargs):
+        """
+        Perform addition.
+        """
+        return sum(args)
+
+    def function_name(self):
+        return "add"
+
+    def function_python_definition(self):
+        return """
+def add(x, y):
+    return x + y
+        """
+
+
+class MultiplicationOperator(Operator):
+    """
+    Perform multiplication
+    """
+    # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes
+
+    def __call__(self, *args, **kwargs):
+        """
+        Perform multiplication.
+        """
+        result = 1
+        for arg in args:
+            result *= arg
+
+        return result
+
+    def function_name(self):
+        return "multiplication"
+
+    def function_python_definition(self):
+        return """
+def multiplication(x, y):
+    return x * y
+        """
+
+
+class SubtractionOperator(Operator):
+    """
+    Perform subtraction.
+    """
+    # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes
+
+    def __call__(self, *args, **kwargs):
+        """
+        Perform subtraction.
+        """
+        result = args[0]
+        for arg in args[1:]:
+            result -= arg
+
+        return result
+
+    def function_name(self):
+        return "subtraction"
+
+    def function_python_definition(self):
+        return """
+def subtraction(x, y):
+    return x - y
+        """
\ No newline at end of file

From e38fffc53ca68ebfda8ae673529307e4907dc12d Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 13 Jan 2018 15:04:09 -0500
Subject: [PATCH 22/51] Fix the test_chromosome and add initial operator test.

---
 mep/genetics/gene.py                  |  4 +++-
 mep/genetics/operator.py              |  1 +
 tests/mep/genetics/test_chromosome.py |  2 +-
 tests/mep/genetics/test_operator.py   | 15 +++++++++++++++
 4 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 tests/mep/genetics/test_operator.py

diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index a83c00b..fa9d12d 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -186,4 +186,6 @@ def __repr__(self):
     def __eq__(self, other):
         if other is None or not isinstance(other, OperatorGene):
             return False
-        return self.operation == other.operation and self.address1 == other.address1 and self.address2 == other.address2
+
+        # NOTE: the operators are the same if they are of the same type
+        return isinstance(self.operation, type(other.operation)) and self.address1 == other.address1 and self.address2 == other.address2
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py
index 4a0060d..c0f5b41 100644
--- a/mep/genetics/operator.py
+++ b/mep/genetics/operator.py
@@ -1,6 +1,7 @@
 from abc import ABCMeta, abstractmethod
 
 
+# TODO: add some more interesting operators; example pow(...), log(...), exp(...), min(...), max(...)
 class Operator(object):
     """
     This is more of a function than a traditional "operator" but the function could be simply using an operator
diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py
index f498274..15081a1 100644
--- a/tests/mep/genetics/test_chromosome.py
+++ b/tests/mep/genetics/test_chromosome.py
@@ -50,7 +50,7 @@ def test_basic_random_construction(self):
         self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0])
 
         # the 2nd gene can be a variable or an operator; in this case it is the below
-        self.assertEquals(OperatorGene(Chromosome.operator_lambdas[1], 0, 0), chromosome.genes[1])
+        self.assertEquals(OperatorGene(Chromosome.operators_family[1](), 0, 0), chromosome.genes[1])
 
         # verify constant
         self.assertAlmostEquals(8.599796663725433, chromosome.constants[0])
diff --git a/tests/mep/genetics/test_operator.py b/tests/mep/genetics/test_operator.py
new file mode 100644
index 0000000..04133a5
--- /dev/null
+++ b/tests/mep/genetics/test_operator.py
@@ -0,0 +1,15 @@
+import unittest
+from mep.genetics.operator import MultiplicationOperator
+
+
+class TestOperators(unittest.TestCase):
+    """
+    Test the Operator classes
+    """
+
+    def test_multiplication_operator(self):
+        """
+        """
+        # construct the oeprator
+        operator = MultiplicationOperator()
+        self.assertEquals(5 * 2, operator(5, 2))
\ No newline at end of file

From 27023d02c187b35d541652d77092e361eb1f2701 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 13 Jan 2018 15:08:17 -0500
Subject: [PATCH 23/51] Test the other operators.

---
 tests/mep/genetics/test_operator.py | 35 ++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/tests/mep/genetics/test_operator.py b/tests/mep/genetics/test_operator.py
index 04133a5..bb5f630 100644
--- a/tests/mep/genetics/test_operator.py
+++ b/tests/mep/genetics/test_operator.py
@@ -1,5 +1,5 @@
 import unittest
-from mep.genetics.operator import MultiplicationOperator
+from mep.genetics.operator import MultiplicationOperator, AdditionOperator, SubtractionOperator
 
 
 class TestOperators(unittest.TestCase):
@@ -10,6 +10,35 @@ class TestOperators(unittest.TestCase):
     def test_multiplication_operator(self):
         """
         """
-        # construct the oeprator
+        # construct the operator and test it
         operator = MultiplicationOperator()
-        self.assertEquals(5 * 2, operator(5, 2))
\ No newline at end of file
+        self.assertEquals(5 * 2, operator(5, 2))
+        self.assertEquals("multiplication", operator.function_name())
+        self.assertEquals("""
+def multiplication(x, y):
+    return x * y
+        """, operator.function_python_definition())
+
+    def test_addition_operator(self):
+            """
+            """
+            # construct the operator and test it
+            operator = AdditionOperator()
+            self.assertEquals(5 + 2, operator(5, 2))
+            self.assertEquals("add", operator.function_name())
+            self.assertEquals("""
+def add(x, y):
+    return x + y
+        """, operator.function_python_definition())
+
+    def test_subtraction_operator(self):
+                """
+                """
+                # construct the operator and test it
+                operator = SubtractionOperator()
+                self.assertEquals(5 - 2, operator(5, 2))
+                self.assertEquals("subtraction", operator.function_name())
+                self.assertEquals("""
+def subtraction(x, y):
+    return x - y
+        """, operator.function_python_definition())
\ No newline at end of file

From b5116a1b140ba9ffae1806c4b2e34fe49f10a1ec Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Sat, 13 Jan 2018 15:17:31 -0500
Subject: [PATCH 24/51] This adds the min and max operators.

---
 mep/genetics/chromosome.py            |  5 ++-
 mep/genetics/operator.py              | 44 ++++++++++++++++++++++++++-
 tests/mep/genetics/test_chromosome.py |  2 +-
 tests/mep/genetics/test_operator.py   | 25 +++++++++++++++
 4 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index c7d5738..e93230b 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -3,6 +3,7 @@
 from collections import deque
 from mep.genetics.gene import Gene, VariableGene, OperatorGene
 from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator
+from mep.genetics.operator import MinOperator, MaxOperator
 from random import random, randint, choice
 
 
@@ -16,7 +17,9 @@ class Chromosome(object):
     # valid operators
     operators_family = [AdditionOperator,
                         MultiplicationOperator,
-                        SubtractionOperator]
+                        SubtractionOperator,
+                        MinOperator,
+                        MaxOperator]
 
     def __init__(self, genes, constants):
         """
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py
index c0f5b41..4c4e0fd 100644
--- a/mep/genetics/operator.py
+++ b/mep/genetics/operator.py
@@ -1,7 +1,7 @@
 from abc import ABCMeta, abstractmethod
 
 
-# TODO: add some more interesting operators; example pow(...), log(...), exp(...), min(...), max(...)
+# TODO: add some more interesting operators; example pow(...), log(...), exp(...)
 class Operator(object):
     """
     This is more of a function than a traditional "operator" but the function could be simply using an operator
@@ -101,4 +101,46 @@ def function_python_definition(self):
         return """
 def subtraction(x, y):
     return x - y
+        """
+
+
+class MinOperator(Operator):
+    """
+    Perform the Min operation.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Perform min
+        """
+        return min(args)
+
+    def function_name(self):
+        return "min_"
+
+    def function_python_definition(self):
+        return """
+def min_(x, y):
+    return min(x, y)
+        """
+
+
+class MaxOperator(Operator):
+    """
+    Perform the Max operation.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Perform max
+        """
+        return max(args)
+
+    def function_name(self):
+        return "max_"
+
+    def function_python_definition(self):
+        return """
+def max_(x, y):
+    return max(x, y)
         """
\ No newline at end of file
diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py
index 15081a1..ac458f7 100644
--- a/tests/mep/genetics/test_chromosome.py
+++ b/tests/mep/genetics/test_chromosome.py
@@ -50,7 +50,7 @@ def test_basic_random_construction(self):
         self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0])
 
         # the 2nd gene can be a variable or an operator; in this case it is the below
-        self.assertEquals(OperatorGene(Chromosome.operators_family[1](), 0, 0), chromosome.genes[1])
+        self.assertEquals(OperatorGene(Chromosome.operators_family[2](), 0, 0), chromosome.genes[1])
 
         # verify constant
         self.assertAlmostEquals(8.599796663725433, chromosome.constants[0])
diff --git a/tests/mep/genetics/test_operator.py b/tests/mep/genetics/test_operator.py
index bb5f630..925a715 100644
--- a/tests/mep/genetics/test_operator.py
+++ b/tests/mep/genetics/test_operator.py
@@ -1,5 +1,6 @@
 import unittest
 from mep.genetics.operator import MultiplicationOperator, AdditionOperator, SubtractionOperator
+from mep.genetics.operator import MinOperator, MaxOperator
 
 
 class TestOperators(unittest.TestCase):
@@ -41,4 +42,28 @@ def test_subtraction_operator(self):
                 self.assertEquals("""
 def subtraction(x, y):
     return x - y
+        """, operator.function_python_definition())
+
+    def test_min_operator(self):
+                    """
+                    """
+                    # construct the operator and test it
+                    operator = MinOperator()
+                    self.assertEquals(min(5, 2), operator(5, 2))
+                    self.assertEquals("min_", operator.function_name())
+                    self.assertEquals("""
+def min_(x, y):
+    return min(x, y)
+        """, operator.function_python_definition())
+
+    def test_max_operator(self):
+        """
+        """
+        # construct the operator and test it
+        operator = MaxOperator()
+        self.assertEquals(max(5, 2), operator(5, 2))
+        self.assertEquals("max_", operator.function_name())
+        self.assertEquals("""
+def max_(x, y):
+    return max(x, y)
         """, operator.function_python_definition())
\ No newline at end of file

From c9e2e09a0a4df37910a8dc65d0b631bccb0196e1 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Wed, 4 Jul 2018 14:13:41 -0400
Subject: [PATCH 25/51] Error check usage.

---
 mep/main.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mep/main.py b/mep/main.py
index e3528ab..6e7e089 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -7,7 +7,13 @@
 from mep.genetics.population import Population
 
 if __name__ == "__main__":
-    # TODO: error check usage
+    if len(sys.argv) != 3:
+        print("ERROR: Expected usage 'python -m mep.main DATA_SET_NAME PYTHON_FILE_NAME'\n" +
+              "     DATA_SET_NAME:    The name (full path) to the data file to train on.\n"
+              "     PYTHON_FILE_NAME: The name (full path) to the python file to write the output program.\n"
+              "Example: 'python -m mep.main datasets/data1.csv test.py'"
+              )
+        sys.exit(-1)
 
     # get the data file
     data_set_name = sys.argv[1]
@@ -69,7 +75,6 @@
         logger.debug(python_program)
         python_file.write(python_program)
 
-    # TODO: Convert the output to a valid python program
     # TODO: Add support for classification
     # TODO: Add example digital circuit test
     # TODO: Add UDFs

From 1a1adfd499f39938571883e487567f1d6f357885 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Wed, 4 Jul 2018 14:27:51 -0400
Subject: [PATCH 26/51] Example data using the max(..) function.

---
 datasets/data4.csv | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 datasets/files.txt |  1 +
 2 files changed, 52 insertions(+)
 create mode 100644 datasets/data4.csv

diff --git a/datasets/data4.csv b/datasets/data4.csv
new file mode 100644
index 0000000..7b48557
--- /dev/null
+++ b/datasets/data4.csv
@@ -0,0 +1,51 @@
+x1,x2,x3,target
+93.0,16.0,-85.0,-7905.0
+-49.0,-5.0,16.0,-80.0
+-41.0,-21.0,79.0,-1659.0
+27.0,81.0,-63.0,-5103.0
+-9.0,-64.0,84.0,-756.0
+81.0,21.0,5.0,405.0
+-20.0,60.0,-75.0,-4500.0
+93.0,-35.0,53.0,4929.0
+0.0,-29.0,80.0,0.0
+-19.0,4.0,-33.0,-132.0
+-88.0,15.0,91.0,1365.0
+-57.0,-100.0,28.0,-1596.0
+91.0,-45.0,-36.0,-3276.0
+-50.0,19.0,-87.0,-1653.0
+-45.0,100.0,48.0,4800.0
+-32.0,22.0,-21.0,-462.0
+59.0,49.0,-46.0,-2714.0
+-40.0,-92.0,5.0,-200.0
+32.0,59.0,-85.0,-5015.0
+-94.0,-29.0,-8.0,232.0
+-21.0,-68.0,-26.0,546.0
+-25.0,26.0,81.0,2106.0
+12.0,25.0,-85.0,-2125.0
+-11.0,40.0,-57.0,-2280.0
+51.0,73.0,7.0,511.0
+100.0,-77.0,-43.0,-4300.0
+-74.0,-35.0,21.0,-735.0
+-34.0,-90.0,-14.0,476.0
+-84.0,-2.0,8.0,-16.0
+-41.0,92.0,-7.0,-644.0
+57.0,-85.0,65.0,3705.0
+38.0,55.0,71.0,3905.0
+-20.0,17.0,-100.0,-1700.0
+-42.0,40.0,-51.0,-2040.0
+-98.0,-52.0,-84.0,4368.0
+80.0,42.0,-4.0,-320.0
+63.0,-32.0,-46.0,-2898.0
+-3.0,-76.0,-19.0,57.0
+-37.0,20.0,-76.0,-1520.0
+-92.0,-24.0,-83.0,1992.0
+-23.0,-64.0,-89.0,2047.0
+94.0,-84.0,65.0,6110.0
+-54.0,3.0,-76.0,-228.0
+59.0,-61.0,49.0,2891.0
+33.0,-51.0,-32.0,-1056.0
+30.0,-22.0,46.0,1380.0
+2.0,36.0,0.0,0.0
+4.0,87.0,-88.0,-7656.0
+61.0,-13.0,-32.0,-1952.0
+46.0,13.0,-76.0,-3496.0
diff --git a/datasets/files.txt b/datasets/files.txt
index cbb59c7..482a2ae 100644
--- a/datasets/files.txt
+++ b/datasets/files.txt
@@ -1,3 +1,4 @@
 data1.csv is f(x_1, x_2) = x_1 + x_2
 data2.csv is f(x_1, x_2) = x_1 * x_2 + x_3
 data3.csv is f(x_1, x_2, x_3) = x_1 * x_1 + x_2 - x_3
+data4.csv is f(x_1, x_2, x_3) = max(x_1, x_2) * x_3
\ No newline at end of file

From 1ef480460b712608de13b20053d32b713b309731 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Wed, 4 Jul 2018 15:35:49 -0400
Subject: [PATCH 27/51] Refactoring to fit into a more scikit-learn approach.

---
 mep/main.py  | 39 +++++-----------------
 mep/model.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 30 deletions(-)
 create mode 100644 mep/model.py

diff --git a/mep/main.py b/mep/main.py
index 6e7e089..d91913f 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -4,7 +4,7 @@
 import logging
 import os
 from dataset import DataSet
-from mep.genetics.population import Population
+from mep.model import MEPModel
 
 if __name__ == "__main__":
     if len(sys.argv) != 3:
@@ -38,39 +38,18 @@
     logger = logging.getLogger("main")
     logger.info("Starting up...")
 
-    # construct a population and run it for the number of generations specified
-    population = Population(data_set.data_matrix, data_set.target, int(config["num_constants"]),
-                            float(config["constants_min"]), float(config["constants_max"]),
-                            float(config["feature_variables_probability"]),
-                            int(config["code_length"]), int(config["population_size"]),
-                            float(config["operators_probability"]))
-    population.initialize()
-
-    # iterate through the generations
-    best_chromosome = None
-    for generation in range(int(config["num_generations"])):
-        best_chromosome = population.chromosomes[0]
-        logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format(
-            generation, best_chromosome.error, len(population.chromosomes)))
-
-        if best_chromosome.error == 0:
-            logger.debug("Exiting early as we have hit the best possible error.")
-            break
-        population.next_generation()
-
-    logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
-                                                                                best_chromosome.pretty_string()))
-
-    # TODO: this should probably be optional
-    # prune out the unused genes
-    best_chromosome.prune()
-    logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error,
-                                                                                best_chromosome.pretty_string()))
+    # configure the model; then fit it to the training data
+    model = MEPModel(int(config["num_constants"]), float(config["constants_min"]), float(config["constants_max"]),
+                     float(config["feature_variables_probability"]), int(config["code_length"]),
+                     int(config["population_size"]), float(config["operators_probability"]),
+                     int(config["num_generations"]))
+    model.fit(data_set.data_matrix, data_set.target)
+    logger.info("Finished fitting the model")
 
     # TODO: Optional?
     # we then convert the chromosome into a valid python program and write it out to file
     with open(python_file_name, 'w') as python_file:
-        python_program = best_chromosome.to_python()
+        python_program = model.to_python()
         logger.debug("Write out the python program to {}".format(python_file_name))
         logger.debug(python_program)
         python_file.write(python_program)
diff --git a/mep/model.py b/mep/model.py
new file mode 100644
index 0000000..973032b
--- /dev/null
+++ b/mep/model.py
@@ -0,0 +1,93 @@
+import logging
+from mep.genetics.population import Population
+
+
+# NOTE: The idea is to explicitly conform to a scikit-learn type of approach where we can run fit(..) and
+# predict(..) methods on the model
+class MEPModel(object):
+    """
+    Encapsulate the MEP model.
+    """
+
+    def __init__(self, num_constants, constants_min, constants_max, feature_variables_probability, code_length,
+                 population_size, operators_probability, num_generations):
+
+        """
+        Initialize.
+        :param num_constants:
+        :param constants_min:
+        :param constants_max:
+        :param feature_variables_probability:
+        :param code_length:
+        :param population_size:
+        :param operators_probability:
+        :param num_generations:
+        """
+        # logger
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # core parameters
+        self.num_constants = num_constants
+        self.constants_min = constants_min
+        self.constants_max = constants_max
+        self.feature_variables_probability = feature_variables_probability
+        self.code_length = code_length
+        self.population_size = population_size
+        self.operators_probability = operators_probability
+        self.num_generations = num_generations
+
+        # the best found chromosome from the evolution process
+        self.best_chromosome = None
+
+    def fit(self, X, y):
+        """
+
+        :param X:
+        :param y:
+        :return:
+        """
+        # construct a population and run it for the number of generations specified
+        population = Population(X, y, self.num_constants,
+                                self.constants_min, self.constants_max,
+                                self.feature_variables_probability,
+                                self.code_length, self.population_size,
+                                self.operators_probability)
+        population.initialize()
+
+        # iterate through the generations
+        for generation in range(self.num_generations):
+            self.best_chromosome = population.chromosomes[0]
+            self.logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format(
+                generation, self.best_chromosome.error, len(population.chromosomes)))
+
+            if self.best_chromosome.error == 0:
+                self.logger.debug("Exiting early as we have hit the best possible error.")
+                break
+            population.next_generation()
+
+        self.logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(
+            self.best_chromosome.error, self.best_chromosome.pretty_string()))
+
+        # prune out the unused genes
+        self.best_chromosome.prune()
+
+    def predict(self, X):
+        """
+
+        :param X:
+        :return:
+        """
+        # TODO: Fill in logic
+        pass
+
+    # NOTE: These are NOT scikit-learn methods now
+    def to_python(self):
+        """
+        Return a python program which can run the model directly via direct inputs.
+        :return: the python program (string)
+        :rtype: str
+        """
+        if self.best_chromosome is None:
+            raise ValueError("The model hasn't been fit.")
+
+        return self.best_chromosome.to_python()

From 61a320342bcfbb983d614b6a6cc8b21a47e681c6 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Fri, 6 Jul 2018 17:43:25 -0400
Subject: [PATCH 28/51] Adds the predict logic.

---
 mep/genetics/chromosome.py | 23 +++++++++++++++++++++++
 mep/model.py               | 20 +++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index e93230b..83e924f 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -113,6 +113,26 @@ def evaluate(self, data_matrix, targets):
                 self.error = error
                 self.best_gene_index = gene_index
 
+    def predict(self, data_matrix):
+        """
+        Return the predictions for this data.
+        :param data_matrix: the sample data; matrix with (n_samples, n_features)
+        :type data_matrix: np.matrix
+        :return: the prediction for each sample; array-like (n_samples) length
+        :rtype: np.array
+        """
+        # NOTE: This is almost identical to evaluate except that we are running after we have done the fit so we have
+        # already determined the best gene index and we just want to calculate the values; no error calc
+        num_examples = data_matrix.shape[0]
+        eval_matrix = np.zeros((len(self.genes), num_examples))
+        dummy_targets = [0] * num_examples
+        for gene_index, gene in enumerate(self.genes):
+            # compute the error for this gene; if it is the best we have found then update
+            gene.evaluate(gene_index, eval_matrix, data_matrix, self.constants, dummy_targets)
+            if self.best_gene_index == gene_index:
+                # extract from the eval_matrix; these from this gene (line in program) for each of the examples
+                return eval_matrix[gene_index, :]
+
     def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max, constants_prob,
                feature_variable_prob, num_feature_variables, num_genes, operators_prob):
         """
@@ -238,6 +258,9 @@ def prune(self):
                 gene.address1 = gene_indices_in_use.index(gene.address1)
                 gene.address2 = gene_indices_in_use.index(gene.address2)
 
+        # the now "best gene" is just the last one
+        self.best_gene_index = len(self.genes) - 1
+
     def to_python(self):
         """
         Convert to python program string.
diff --git a/mep/model.py b/mep/model.py
index 973032b..7a3d529 100644
--- a/mep/model.py
+++ b/mep/model.py
@@ -73,11 +73,29 @@ def fit(self, X, y):
 
     def predict(self, X):
         """
+        Return the predictions for this data.
+        :param X: the sample data; matrix with (n_samples, n_features)
+        :type X: np.matrix
+        :return: the prediction for each sample; array-like (n_samples) length
+        :rtype: np.array
+        """
+        return self.best_chromosome.predict(X)
+
+    def score(self, X):
+        """
+        Returns the coefficient of determination R^2 of the prediction.
+
+        The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
+        ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum().
+        The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).
+        A constant model that always predicts the expected value of y, disregarding the input features, would get a
+        R^2 score of 0.0.
 
+        (NOTE: Comment taken from scikit-learn.)
         :param X:
         :return:
         """
-        # TODO: Fill in logic
+        # TODO:
         pass
 
     # NOTE: These are NOT scikit-learn methods now

From a7804d4d86c85e41c60d730fa7ead177ece95bb2 Mon Sep 17 00:00:00 2001
From: Paul Jacobs <paul.f.jacobs@gmail.com>
Date: Fri, 6 Jul 2018 20:12:32 -0400
Subject: [PATCH 29/51] Populates the score method of the model.

---
 mep/model.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/mep/model.py b/mep/model.py
index 7a3d529..915c998 100644
--- a/mep/model.py
+++ b/mep/model.py
@@ -81,7 +81,7 @@ def predict(self, X):
         """
         return self.best_chromosome.predict(X)
 
-    def score(self, X):
+    def score(self, X, y):
         """
         Returns the coefficient of determination R^2 of the prediction.
 
@@ -92,11 +92,18 @@ def score(self, X):
         R^2 score of 0.0.
 
         (NOTE: Comment taken from scikit-learn.)
-        :param X:
-        :return:
+        :param X: the sample data; matrix with (n_samples, n_features)
+        :type X: np.matrix
+        :param y: the target values
+        :type y: array-like, shape = (n_samples)
+        :return: the score
+        :rtype: float
         """
-        # TODO:
-        pass
+        y_pred = self.predict(X)
+        u = ((y - y_pred) ** 2).sum()
+        v = ((y - y.mean()) ** 2).sum()
+
+        return 1 - u/v
 
     # NOTE: These are NOT scikit-learn methods now
     def to_python(self):

From ecfee48046d626b8cabaf5d051fcde24ac8518a9 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Wed, 17 Oct 2018 17:50:38 +0000
Subject: [PATCH 30/51] a basic test of the model

---
 mep/model.py                     |  8 +++---
 tests/mep/genetics/test_model.py | 47 ++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
 create mode 100644 tests/mep/genetics/test_model.py

diff --git a/mep/model.py b/mep/model.py
index 915c998..a44fa30 100644
--- a/mep/model.py
+++ b/mep/model.py
@@ -41,10 +41,10 @@ def __init__(self, num_constants, constants_min, constants_max, feature_variable
 
     def fit(self, X, y):
         """
-
-        :param X:
-        :param y:
-        :return:
+        Fit the model. Given the feature vectors in matrix 'X' and the target vector 'y' we fit our model.
+        :param X: the feature matrix (training data)
+        :param y: the target values
+        :return: nothing
         """
         # construct a population and run it for the number of generations specified
         population = Population(X, y, self.num_constants,
diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py
new file mode 100644
index 0000000..fae9833
--- /dev/null
+++ b/tests/mep/genetics/test_model.py
@@ -0,0 +1,47 @@
+import unittest
+from mep.model import MEPModel
+import random
+import numpy as np
+
+# make reproducible
+random.seed(1)
+
+
+class TestModel(unittest.TestCase):
+    """
+    Test the model.
+    """
+
+    def test_model_basic(self):
+        model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
+                         feature_variables_probability=0.4, code_length=50,
+                         population_size=100, operators_probability=0.5,
+                         num_generations=200)
+
+        # generate data from this function
+        def function_to_learn(x1, x2):
+            return x1 + x2
+        training_feature_matrix = []
+        training_target_vector = []
+        for sample in range(100):
+            x1, x2 = random.randint(-100, 100), random.randint(-100, 100)
+            val = function_to_learn(x1, x2)
+            training_feature_matrix.append([x1, x2])
+            training_target_vector.append(val)
+
+        # fit the model
+        model.fit(np.matrix(training_feature_matrix), np.array(training_target_vector))
+
+        # test data
+        def function_to_learn(x1, x2):
+            return x1 + x2
+        test_feature_matrix = []
+        test_target_vector = []
+        for sample in range(100):
+            x1, x2 = random.randint(-100, 100), random.randint(-100, 100)
+            val = function_to_learn(x1, x2)
+            test_feature_matrix.append([x1, x2])
+            test_target_vector.append(val)
+
+        self.assertEquals(model.score(np.matrix(training_feature_matrix), np.array(training_target_vector)), 1)
+

From a4209a0d937a1a34ffd4632aac17258eff092417 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Wed, 17 Oct 2018 18:44:50 +0000
Subject: [PATCH 31/51] more complex functions to learn

---
 tests/mep/genetics/test_model.py | 84 ++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 19 deletions(-)

diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py
index fae9833..52fc8d2 100644
--- a/tests/mep/genetics/test_model.py
+++ b/tests/mep/genetics/test_model.py
@@ -2,16 +2,42 @@
 from mep.model import MEPModel
 import random
 import numpy as np
+import logging
+import datetime as dt
 
 # make reproducible
 random.seed(1)
 
+logging.basicConfig(filename="logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")),
+                    level=logging.DEBUG,
+                    filemode='w',
+                    format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s")
+logger = logging.getLogger("main")
+
 
 class TestModel(unittest.TestCase):
     """
     Test the model.
     """
 
+    def _generate_train_and_test(self, function_to_learn, num_samples, num_args):
+        training_feature_matrix = []
+        training_target_vector = []
+        for sample in range(num_samples):
+            args = [random.randint(-250, 250) for _ in range(num_args)]
+            training_feature_matrix.append(args)
+            training_target_vector.append(function_to_learn(*args))
+
+        test_feature_matrix = []
+        test_target_vector = []
+        for sample in range(num_samples):
+            args = [random.randint(-250, 250) for _ in range(num_args)]
+            test_feature_matrix.append(args)
+            test_target_vector.append(function_to_learn(*args))
+
+        return np.matrix(training_feature_matrix), np.array(training_target_vector), \
+               np.matrix(test_feature_matrix), np.array(test_target_vector)
+
     def test_model_basic(self):
         model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
                          feature_variables_probability=0.4, code_length=50,
@@ -21,27 +47,47 @@ def test_model_basic(self):
         # generate data from this function
         def function_to_learn(x1, x2):
             return x1 + x2
-        training_feature_matrix = []
-        training_target_vector = []
-        for sample in range(100):
-            x1, x2 = random.randint(-100, 100), random.randint(-100, 100)
-            val = function_to_learn(x1, x2)
-            training_feature_matrix.append([x1, x2])
-            training_target_vector.append(val)
+
+        training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector = self._generate_train_and_test(
+            function_to_learn, 100, 2)
 
         # fit the model
-        model.fit(np.matrix(training_feature_matrix), np.array(training_target_vector))
+        model.fit(training_feature_matrix, training_target_vector)
 
-        # test data
-        def function_to_learn(x1, x2):
-            return x1 + x2
-        test_feature_matrix = []
-        test_target_vector = []
-        for sample in range(100):
-            x1, x2 = random.randint(-100, 100), random.randint(-100, 100)
-            val = function_to_learn(x1, x2)
-            test_feature_matrix.append([x1, x2])
-            test_target_vector.append(val)
+        self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1)
+
+    def test_model_min_max(self):
+        model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
+                         feature_variables_probability=0.4, code_length=50,
+                         population_size=100, operators_probability=0.7,
+                         num_generations=200)
+
+        # generate data from this function
+        def function_to_learn(x1, x2, x3, x4):
+            return min(x1, x2) + max(x3, x4)
+
+        training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector = self._generate_train_and_test(
+            function_to_learn, 100, 4)
 
-        self.assertEquals(model.score(np.matrix(training_feature_matrix), np.array(training_target_vector)), 1)
+        # fit the model
+        model.fit(training_feature_matrix, training_target_vector)
+
+        self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1)
+
+    def test_model_pow(self):
+        model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
+                         feature_variables_probability=0.4, code_length=50,
+                         population_size=100, operators_probability=0.5,
+                         num_generations=200)
+
+        # generate data from this function
+        def function_to_learn(x1, x2, x3, x4):
+            return x1 * x2 + x2 * x2 + x3
+
+        training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector = self._generate_train_and_test(
+            function_to_learn, 100, 4)
+
+        # fit the model
+        model.fit(training_feature_matrix, training_target_vector)
 
+        self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1)

From 98858b0d5e012779223b260dd737d5967876321d Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 18:50:04 +0000
Subject: [PATCH 32/51] requirements file

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..70f1d2f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.17
+pandas==0.25
\ No newline at end of file

From a029a0811f065878045a41392c744e332641c6a3 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 18:50:29 +0000
Subject: [PATCH 33/51] fix import

---
 mep/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mep/main.py b/mep/main.py
index d91913f..5c04b1e 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -3,7 +3,7 @@
 import json
 import logging
 import os
-from dataset import DataSet
+from mep.dataset import DataSet
 from mep.model import MEPModel
 
 if __name__ == "__main__":

From a0dbf2e4ea473651996b45006b61e6080f69d22a Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 18:53:43 +0000
Subject: [PATCH 34/51] pip approach

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 64312ca..9486b32 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,13 @@ conda env create -f environment.yaml
 source activate py-mep-dev
 ```
 
+Or using `pip` we could do:
+
+```
+virtualenv -p python3 .venv
+pip install -r requirements.txt
+```
+
 Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`.
 
 An example Python program evolved to solve the addition problem of adding together two features (ex: datasets/data1.csv):

From 81411cbed7662ad1c63913b242aeeb65d0229bbe Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:21:27 +0000
Subject: [PATCH 35/51] drop old fashioned (object) inheritance

---
 mep/dataset.py             | 2 +-
 mep/genetics/chromosome.py | 2 +-
 mep/genetics/gene.py       | 2 +-
 mep/genetics/operator.py   | 2 +-
 mep/genetics/population.py | 2 +-
 mep/model.py               | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mep/dataset.py b/mep/dataset.py
index 4f09241..2962c0d 100644
--- a/mep/dataset.py
+++ b/mep/dataset.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 
-class DataSet(object):
+class DataSet:
     """
     Encapsulate a data set. Feature vectors and their targets.
     """
diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 83e924f..5e8e37d 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -7,7 +7,7 @@
 from random import random, randint, choice
 
 
-class Chromosome(object):
+class Chromosome:
     """
     Level above Gene. Each chromosome is a fixed number of genes and constants. We can think of a chromosome as a
     program where each gene is a line of code in the program. Genes can reference the result of other genes by their
diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index fa9d12d..4672eed 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -118,7 +118,7 @@ def __eq__(self, other):
         return self.index == other.index and self.is_feature == other.is_feature
 
 
-class OperatorGene(object):
+class OperatorGene:
     """
     This gene performance an operation on two addresses. The addresses are indices in the eval_matrix -- i.e. from the
     evaluation of other genes before this one.
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py
index 4c4e0fd..16f38be 100644
--- a/mep/genetics/operator.py
+++ b/mep/genetics/operator.py
@@ -2,7 +2,7 @@
 
 
 # TODO: add some more interesting operators; example pow(...), log(...), exp(...)
-class Operator(object):
+class Operator:
     """
     This is more of a function than a traditional "operator" but the function could be simply using an operator
     like "+", "-", etc. At it's core these are indivisible functions that take arguments (i.e. preceding genes) and
diff --git a/mep/genetics/population.py b/mep/genetics/population.py
index dd60592..97c4d8d 100644
--- a/mep/genetics/population.py
+++ b/mep/genetics/population.py
@@ -3,7 +3,7 @@
 import copy
 
 
-class Population(object):
+class Population:
     """
     A collection of chromosomes.
     """
diff --git a/mep/model.py b/mep/model.py
index a44fa30..6cbc3f3 100644
--- a/mep/model.py
+++ b/mep/model.py
@@ -4,7 +4,7 @@
 
 # NOTE: The idea is to explicitly conform to a scikit-learn type of approach where we can run fit(..) and
 # predict(..) methods on the model
-class MEPModel(object):
+class MEPModel:
     """
     Encapsulate the MEP model.
     """

From 9c5ebf8e3880cb3c8266eeef302f7a13f9b9a223 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:28:59 +0000
Subject: [PATCH 36/51] type hinting

---
 mep/model.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mep/model.py b/mep/model.py
index 6cbc3f3..52fbf7b 100644
--- a/mep/model.py
+++ b/mep/model.py
@@ -1,4 +1,5 @@
 import logging
+import numpy as np
 from mep.genetics.population import Population
 
 
@@ -9,8 +10,9 @@ class MEPModel:
     Encapsulate the MEP model.
     """
 
-    def __init__(self, num_constants, constants_min, constants_max, feature_variables_probability, code_length,
-                 population_size, operators_probability, num_generations):
+    def __init__(self, num_constants: int, constants_min: float, constants_max: float,
+                 feature_variables_probability: float, code_length: int, population_size: int,
+                 operators_probability: float, num_generations: int):
 
         """
         Initialize.
@@ -39,7 +41,7 @@ def __init__(self, num_constants, constants_min, constants_max, feature_variable
         # the best found chromosome from the evolution process
         self.best_chromosome = None
 
-    def fit(self, X, y):
+    def fit(self, X: np.ndarray, y: np.ndarray):
         """
         Fit the model. Given the feature vectors in matrix 'X' and the target vector 'y' we fit our model.
         :param X: the feature matrix (training data)
@@ -71,17 +73,15 @@ def fit(self, X, y):
         # prune out the unused genes
         self.best_chromosome.prune()
 
-    def predict(self, X):
+    def predict(self, X: np.ndarray) -> np.ndarray:
         """
         Return the predictions for this data.
         :param X: the sample data; matrix with (n_samples, n_features)
-        :type X: np.matrix
         :return: the prediction for each sample; array-like (n_samples) length
-        :rtype: np.array
         """
         return self.best_chromosome.predict(X)
 
-    def score(self, X, y):
+    def score(self, X: np.ndarray, y: np.ndarray) -> float:
         """
         Returns the coefficient of determination R^2 of the prediction.
 
@@ -93,11 +93,9 @@ def score(self, X, y):
 
         (NOTE: Comment taken from scikit-learn.)
         :param X: the sample data; matrix with (n_samples, n_features)
-        :type X: np.matrix
         :param y: the target values
         :type y: array-like, shape = (n_samples)
         :return: the score
-        :rtype: float
         """
         y_pred = self.predict(X)
         u = ((y - y_pred) ** 2).sum()

From 837302afb64a27df64b5c86891f231b93cc3b1a8 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:29:18 +0000
Subject: [PATCH 37/51] use a consistent output log dir

---
 tests/mep/genetics/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py
index 52fc8d2..2859262 100644
--- a/tests/mep/genetics/test_model.py
+++ b/tests/mep/genetics/test_model.py
@@ -8,7 +8,7 @@
 # make reproducible
 random.seed(1)
 
-logging.basicConfig(filename="logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")),
+logging.basicConfig(filename="output_logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")),
                     level=logging.DEBUG,
                     filemode='w',
                     format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s")

From 09a092e2e371237d9e3edbbe6584c33403f017a3 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:29:41 +0000
Subject: [PATCH 38/51] pytest is needed for the ./test.sh

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 70f1d2f..3fa8759 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 numpy==1.17
-pandas==0.25
\ No newline at end of file
+pandas==0.25
+pytest==5.2.1
\ No newline at end of file

From 7ea7e5ec313ae659a82fef0705de8655f2ad453d Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:30:58 +0000
Subject: [PATCH 39/51] explicit reference to the test dir

---
 test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test.sh b/test.sh
index be79cbe..cd7b35b 100755
--- a/test.sh
+++ b/test.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-py.test
\ No newline at end of file
+py.test tests/

From 8802f43d553789651c1c8e5164991568a3e3bc6b Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:34:56 +0000
Subject: [PATCH 40/51] use python 3+ abstract classes and genes need to
 inherit Gene

---
 mep/genetics/gene.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index 4672eed..7e07d87 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -3,11 +3,10 @@
 from abc import ABCMeta, abstractmethod
 
 
-class Gene(object):
+class Gene(metaclass=ABCMeta):
     """
     Lowest level of the genetic structure of MEP. Think of this as one line of code in the program.
     """
-    __metaclass__ = ABCMeta
 
     @abstractmethod
     def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
@@ -35,7 +34,7 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
 # a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit.
 
 
-class VariableGene(object):
+class VariableGene(Gene):
     """
     This gene is simply a variable. Either a constant or one of the features in the data -- i.e. an input variable.
     """
@@ -118,7 +117,7 @@ def __eq__(self, other):
         return self.index == other.index and self.is_feature == other.is_feature
 
 
-class OperatorGene:
+class OperatorGene(Gene):
     """
     This gene performance an operation on two addresses. The addresses are indices in the eval_matrix -- i.e. from the
     evaluation of other genes before this one.

From 348d31cbecefef656ec19035b7caa5f19064d989 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:43:25 +0000
Subject: [PATCH 41/51] use newer abstract class mechanism, better str rep, and
 type hinting

---
 mep/genetics/gene.py     | 17 +++++++----------
 mep/genetics/operator.py |  9 +++++++--
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index 7e07d87..0309e09 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -2,6 +2,8 @@
 import numpy as np
 from abc import ABCMeta, abstractmethod
 
+from mep.genetics.operator import Operator
+
 
 class Gene(metaclass=ABCMeta):
     """
@@ -9,7 +11,7 @@ class Gene(metaclass=ABCMeta):
     """
 
     @abstractmethod
-    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float:
         """
         This method will modify the eval_matrix for this gene index for each example in the data_matrix.
 
@@ -39,20 +41,18 @@ class VariableGene(Gene):
     This gene is simply a variable. Either a constant or one of the features in the data -- i.e. an input variable.
     """
 
-    def __init__(self, index, is_feature=True):
+    def __init__(self, index: int, is_feature=True):
         """
         The index into either the feature vector (if "is_feature" is True) or into the constants.
         :param index: the index into the vector
-        :type index: int
         :param is_feature: whether this is a feature variable or a constant
-        :type is_feature: bool
         """
         # self.logger = logging.getLogger(self.__class__)
 
         self.index = index
         self.is_feature = is_feature
 
-    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float:
         """
         This method will modify the eval_matrix for this gene index for each example in the data_matrix.
 
@@ -124,15 +124,12 @@ class OperatorGene(Gene):
     """
 
     # NOTE: This could be expanded to multiple addresses
-    def __init__(self, operation, address1, address2):
+    def __init__(self, operation: Operator, address1: int, address2: int):
         """
         Initialize.
         :param operation: a lambda or function that can be operated on two floats
-        :type operation: lambda
         :param address1: index into the eval_matrix
-        :type address1: int
         :param address2: index into the eval_matrix
-        :type address2: int
         """
         # self.logger = logging.getLogger(self.__class__)
 
@@ -140,7 +137,7 @@ def __init__(self, operation, address1, address2):
         self.address1 = address1
         self.address2 = address2
 
-    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets):
+    def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float:
         """
         This method will modify the eval_matrix for this gene index for each example in the data_matrix.
 
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py
index 16f38be..5790c1a 100644
--- a/mep/genetics/operator.py
+++ b/mep/genetics/operator.py
@@ -2,13 +2,12 @@
 
 
 # TODO: add some more interesting operators; example pow(...), log(...), exp(...)
-class Operator:
+class Operator(metaclass=ABCMeta):
     """
     This is more of a function than a traditional "operator" but the function could be simply using an operator
     like "+", "-", etc. At it's core these are indivisible functions that take arguments (i.e. preceding genes) and
     output some value.
     """
-    __metaclass__ = ABCMeta
 
     @abstractmethod
     def __call__(self, *args, **kwargs):
@@ -28,6 +27,12 @@ def function_python_definition(self):
         Return the python definition of the function
         """
 
+    def __str__(self):
+        return self.function_name()
+
+    def __repr__(self):
+        return str(self)
+
 
 # TODO: Consolidate these into just one Operator?
 class AdditionOperator(Operator):

From 1f2c175d00729902a953513436879b08a0e3baa3 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:44:28 +0000
Subject: [PATCH 42/51] test must have broken with upgrade (change in random
 seed?) so this fixes it

---
 tests/mep/genetics/test_chromosome.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py
index ac458f7..ba093eb 100644
--- a/tests/mep/genetics/test_chromosome.py
+++ b/tests/mep/genetics/test_chromosome.py
@@ -43,17 +43,17 @@ def test_basic_random_construction(self):
                                                            operators_prob=0.5)
 
         # confirm the number of genes and constants match what we expect
-        self.assertEquals(num_genes, len(chromosome.genes))
-        self.assertEquals(num_constants, len(chromosome.constants))
+        self.assertEqual(num_genes, len(chromosome.genes))
+        self.assertEqual(num_constants, len(chromosome.constants))
 
         # the first gene has to be a variable gene; in particular it is this one
-        self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0])
+        self.assertEqual(VariableGene(0, is_feature=False), chromosome.genes[0])
 
         # the 2nd gene can be a variable or an operator; in this case it is the below
-        self.assertEquals(OperatorGene(Chromosome.operators_family[2](), 0, 0), chromosome.genes[1])
+        self.assertEqual(OperatorGene(Chromosome.operators_family[4](), 0, 0), chromosome.genes[1])
 
         # verify constant
-        self.assertAlmostEquals(8.599796663725433, chromosome.constants[0])
+        self.assertAlmostEqual(8.599796663725433, chromosome.constants[0])
 
     def test_evaluate(self):
         """

From a453564cb9a7bf199c5439bd9bd0902227c065a0 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:46:27 +0000
Subject: [PATCH 43/51] drop deprecated assertEquals in favor of assertEqual

---
 tests/mep/genetics/test_gene.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py
index 7b62d12..26c4b8c 100644
--- a/tests/mep/genetics/test_gene.py
+++ b/tests/mep/genetics/test_gene.py
@@ -34,7 +34,7 @@ def test_basic_constant(self):
         # run the evaluate
         error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets)
         self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix))
-        self.assertEquals((1. - 0) + (1. - 0), error)
+        self.assertEqual((1. - 0) + (1. - 0), error)
 
     def test_basic_feature_gene(self):
         """
@@ -66,7 +66,7 @@ def test_basic_feature_gene(self):
         # run the evaluate
         error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets)
         self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix))
-        self.assertEquals((5. - 0.) + (7. - 0.), error)
+        self.assertEqual((5. - 0.) + (7. - 0.), error)
 
     def test_constant_and_feature_gene(self):
         """

From ffd2a01ce1e4d9f9443611bb87397bff8804cd9e Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 19:55:11 +0000
Subject: [PATCH 44/51] drop deprecated matrix in favor of array in newer numpy

---
 tests/mep/genetics/test_gene.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py
index 26c4b8c..037ef00 100644
--- a/tests/mep/genetics/test_gene.py
+++ b/tests/mep/genetics/test_gene.py
@@ -29,7 +29,7 @@ def test_basic_constant(self):
 
         # expected; only one gene and it is going to be using the first constant;
         gene_index = 0
-        expected_eval_matrix = np.matrix([[constants[constant_index], constants[constant_index]]])
+        expected_eval_matrix = np.array([[constants[constant_index], constants[constant_index]]])
 
         # run the evaluate
         error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets)
@@ -61,7 +61,7 @@ def test_basic_feature_gene(self):
 
         # expected; only one gene and it is going to be using the first constant;
         gene_index = 0
-        expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]]])
+        expected_eval_matrix = np.array([[data_matrix[0, feature_index], data_matrix[1, feature_index]]])
 
         # run the evaluate
         error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets)
@@ -94,7 +94,7 @@ def test_constant_and_feature_gene(self):
         data_matrix[1, feature_index] = 7.
 
         # expected;
-        expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]],
+        expected_eval_matrix = np.array([[data_matrix[0, feature_index], data_matrix[1, feature_index]],
                                           [constants[constant_index], constants[constant_index]]])
 
         # run the evaluate
@@ -127,8 +127,7 @@ def test_operator_gene_basic(self):
         eval_matrix[0, 0] = 2
 
         # expected; first gene is unchanged; the 2nd one is the sum of the first with itself (i.e. 4)
-        expected_eval_matrix = np.matrix([[2],
-                                          [4]])
+        expected_eval_matrix = np.array([[2], [4]])
 
         # run the evaluate
         error = gene.evaluate(1, eval_matrix, data_matrix, constants, targets)

From e0502794f454197b064466399ecdc8665c8761df Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Thu, 24 Oct 2019 20:00:34 +0000
Subject: [PATCH 45/51] assertEqual in test model

---
 mep/genetics/gene.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index 0309e09..aa6d3c3 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -1,4 +1,6 @@
 import logging
+from typing import Union, Callable
+
 import numpy as np
 from abc import ABCMeta, abstractmethod
 
@@ -124,7 +126,7 @@ class OperatorGene(Gene):
     """
 
     # NOTE: This could be expanded to multiple addresses
-    def __init__(self, operation: Operator, address1: int, address2: int):
+    def __init__(self, operation: Union[Callable, Operator], address1: int, address2: int):
         """
         Initialize.
         :param operation: a lambda or function that can be operated on two floats

From b1f3244ecfe1001da3faac6433bd67402c91c9bb Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Fri, 25 Oct 2019 16:18:22 +0000
Subject: [PATCH 46/51] replace deprecated assertEquals with assertEqual

---
 tests/mep/genetics/test_model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py
index 2859262..f42d52c 100644
--- a/tests/mep/genetics/test_model.py
+++ b/tests/mep/genetics/test_model.py
@@ -54,7 +54,7 @@ def function_to_learn(x1, x2):
         # fit the model
         model.fit(training_feature_matrix, training_target_vector)
 
-        self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1)
+        self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1)
 
     def test_model_min_max(self):
         model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
@@ -72,7 +72,7 @@ def function_to_learn(x1, x2, x3, x4):
         # fit the model
         model.fit(training_feature_matrix, training_target_vector)
 
-        self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1)
+        self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1)
 
     def test_model_pow(self):
         model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
@@ -90,4 +90,4 @@ def function_to_learn(x1, x2, x3, x4):
         # fit the model
         model.fit(training_feature_matrix, training_target_vector)
 
-        self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1)
+        self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1)

From 9f6a424108345834c528cc44356a7f10abbdd3dd Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Fri, 25 Oct 2019 16:42:32 +0000
Subject: [PATCH 47/51] use arg parse

---
 mep/main.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/mep/main.py b/mep/main.py
index 5c04b1e..d3ab6f3 100644
--- a/mep/main.py
+++ b/mep/main.py
@@ -1,3 +1,4 @@
+import argparse
 import sys
 import datetime as dt
 import json
@@ -7,17 +8,15 @@
 from mep.model import MEPModel
 
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        print("ERROR: Expected usage 'python -m mep.main DATA_SET_NAME PYTHON_FILE_NAME'\n" +
-              "     DATA_SET_NAME:    The name (full path) to the data file to train on.\n"
-              "     PYTHON_FILE_NAME: The name (full path) to the python file to write the output program.\n"
-              "Example: 'python -m mep.main datasets/data1.csv test.py'"
-              )
-        sys.exit(-1)
+    parser = argparse.ArgumentParser(
+        description="Run the MEP model.\nExample: 'python -m mep.main datasets/data1.csv test.py", allow_abbrev=False)
+    parser.add_argument("data_set_name", help="The name (full path) to the data file to train on.")
+    parser.add_argument("python_file_name", help="The name (full path) to the python file to write the output program.")
+    args = parser.parse_args()
 
     # get the data file
-    data_set_name = sys.argv[1]
-    python_file_name = sys.argv[2]
+    data_set_name = args.data_set_name
+    python_file_name = args.python_file_name
     data_set = DataSet(data_set_name)
 
     # read config file
@@ -46,13 +45,13 @@
     model.fit(data_set.data_matrix, data_set.target)
     logger.info("Finished fitting the model")
 
-    # TODO: Optional?
     # we then convert the chromosome into a valid python program and write it out to file
-    with open(python_file_name, 'w') as python_file:
+    if python_file_name:
         python_program = model.to_python()
-        logger.debug("Write out the python program to {}".format(python_file_name))
-        logger.debug(python_program)
-        python_file.write(python_program)
+        with open(python_file_name, 'w') as python_file:
+            logger.debug("Write out the python program to {}".format(python_file_name))
+            logger.debug(python_program)
+            python_file.write(python_program)
 
     # TODO: Add support for classification
     # TODO: Add example digital circuit test

From fe579aa65885f7a85b78859e004dbaf8a85ed57d Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Mon, 28 Oct 2019 21:12:37 +0000
Subject: [PATCH 48/51] print the pruned chromosome too

---
 mep/genetics/chromosome.py | 7 ++++---
 mep/model.py               | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 5e8e37d..a38a3e5 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -196,10 +196,11 @@ def pretty_string(self, stop_at_best=True):
 
         # now show each gene on a separate line
         for gene_index, gene in enumerate(self.genes):
-            gene_str = gene.__str__()
-            if type(gene) == VariableGene:
+            gene_str = str(gene)
+            if isinstance(gene, VariableGene):
                 gene_str = gene.pretty_string()
-            elif type(gene) == OperatorGene:
+            elif isinstance(gene, OperatorGene):
+                # TODO: why not push to the gene?
                 gene_str = "{}(PROGRAM[{}], PROGRAM[{}])".format(gene.operation.function_name(),
                                                                  gene.address1, gene.address2)
             program += "{}:{}\n".format(gene_index, gene_str)
diff --git a/mep/model.py b/mep/model.py
index 52fbf7b..60b34f6 100644
--- a/mep/model.py
+++ b/mep/model.py
@@ -73,6 +73,8 @@ def fit(self, X: np.ndarray, y: np.ndarray):
         # prune out the unused genes
         self.best_chromosome.prune()
 
+        self.logger.debug("Pruned chromosome (pretty)\n {}".format(self.best_chromosome.pretty_string()))
+
     def predict(self, X: np.ndarray) -> np.ndarray:
         """
         Return the predictions for this data.

From 5dfdb83c206252f6cbe48e5a27a4cd1aeecc50eb Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Mon, 28 Oct 2019 21:15:58 +0000
Subject: [PATCH 49/51] use isinstance(..)

---
 mep/genetics/chromosome.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index a38a3e5..b08642d 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -216,6 +216,9 @@ def prune(self):
         Trim out the unused genes. NOTE: This "breaks" the chromosomes as it is going to change how many genes are
         in the program. Only do this once we have finished evolving the program.
         """
+
+        # TODO: drop genes which do nothing; ex: min(x[0], x[0]) or max(x[0], x[0])
+
         # the best gene index is going to be the last line of the program; since the genes never reference genes
         # beyond it then we just proceed back to the top and remove any which haven't been referenced; we determine
         # this via a BFS type search
@@ -238,7 +241,7 @@ def prune(self):
 
             # check the addresses on the gene if it is an operator
             gene = self.genes[gene_index]
-            if type(gene) == OperatorGene:
+            if isinstance(gene, OperatorGene):
                 genes_indices_to_visit.appendleft(gene.address1)
                 genes_indices_to_visit.appendleft(gene.address2)
                 gene_indices_in_use.add(gene.address1)
@@ -255,7 +258,7 @@ def prune(self):
         # TODO: This could be done in the list comprehension but it is clearer to just do another pass
         # re-map the address to the new index
         for gene in self.genes:
-            if type(gene) == OperatorGene:
+            if isinstance(gene, OperatorGene):
                 gene.address1 = gene_indices_in_use.index(gene.address1)
                 gene.address2 = gene_indices_in_use.index(gene.address2)
 
@@ -295,12 +298,12 @@ def to_python(self):
         genes_str = "program = [0] * {}\n".format(len(self.genes))
         for gene_index, gene in enumerate(self.genes):
             genes_str += "    program[{}] = ".format(gene_index)
-            if type(gene) == VariableGene:
+            if isinstance(gene, VariableGene):
                 if gene.is_feature:
                     genes_str += "float(sys.argv[{}])".format(gene.index + 1)
                 else:
                     genes_str += "constants[{}]".format(gene.index)
-            elif type(gene) == OperatorGene:
+            elif isinstance(gene, OperatorGene):
                 genes_str += "{}(program[{}], program[{}])".format(gene.operation.function_name(),
                                                                    gene.address1, gene.address2)
             genes_str += "\n"

From dbd710e0feb86c707da063682a487b9057249d98 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Mon, 28 Oct 2019 21:44:40 +0000
Subject: [PATCH 50/51] push pretty string into the gene

---
 mep/genetics/chromosome.py |  9 +--------
 mep/genetics/gene.py       | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index b08642d..1762b72 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -196,14 +196,7 @@ def pretty_string(self, stop_at_best=True):
 
         # now show each gene on a separate line
         for gene_index, gene in enumerate(self.genes):
-            gene_str = str(gene)
-            if isinstance(gene, VariableGene):
-                gene_str = gene.pretty_string()
-            elif isinstance(gene, OperatorGene):
-                # TODO: why not push to the gene?
-                gene_str = "{}(PROGRAM[{}], PROGRAM[{}])".format(gene.operation.function_name(),
-                                                                 gene.address1, gene.address2)
-            program += "{}:{}\n".format(gene_index, gene_str)
+            program += "{}:{}\n".format(gene_index, gene.pretty_string())
 
             if self.best_gene_index == gene_index and stop_at_best:
                 return program
diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py
index aa6d3c3..5246355 100644
--- a/mep/genetics/gene.py
+++ b/mep/genetics/gene.py
@@ -32,7 +32,11 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) ->
         :return: error (sum of error across the examples); modifies the eval_matrix
         :rtype: float
         """
+        raise NotImplementedError()
 
+    @abstractmethod
+    def pretty_string(self) -> str:
+        raise NotImplementedError()  # abstract: VariableGene and OperatorGene supply the string
 
 # NOTE: Should we also add a mutate method to the gene itself? Considering that we are doing the mutation by doing
 # a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit.
@@ -99,11 +103,10 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) ->
     def __str__(self):
         return "VariableGene({}, is_feature={})".format(self.index, self.is_feature)
 
-    def pretty_string(self):
+    def pretty_string(self) -> str:
         """
         Pretty program string version.
         :return: string version
-        :rtype: str
         """
         if self.is_feature:
             return "FEATURES[{}]".format(self.index)
@@ -187,3 +190,10 @@ def __eq__(self, other):
 
         # NOTE: the operators are the same if they are of the same type
         return isinstance(self.operation, type(other.operation)) and self.address1 == other.address1 and self.address2 == other.address2
+
+    def pretty_string(self) -> str:
+        """
+        Pretty program string version: the operation's function name applied to both addresses.
+        :return: string of the form "<function_name>(PROGRAM[<address1>], PROGRAM[<address2>])"
+        """
+        return "{}(PROGRAM[{}], PROGRAM[{}])".format(self.operation.function_name(), self.address1, self.address2)

From 84bf2624d5de03176d07a02eec1b383fc9589123 Mon Sep 17 00:00:00 2001
From: pjacobs <paul@kbit.com>
Date: Tue, 29 Oct 2019 21:16:37 +0000
Subject: [PATCH 51/51] division operator

---
 mep/genetics/chromosome.py |  6 ++++--
 mep/genetics/operator.py   | 31 +++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py
index 1762b72..d669a15 100644
--- a/mep/genetics/chromosome.py
+++ b/mep/genetics/chromosome.py
@@ -2,7 +2,7 @@
 import numpy as np
 from collections import deque
 from mep.genetics.gene import Gene, VariableGene, OperatorGene
-from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator
+from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator, DivisionOperator
 from mep.genetics.operator import MinOperator, MaxOperator
 from random import random, randint, choice
 
@@ -19,7 +19,9 @@ class Chromosome:
                         MultiplicationOperator,
                         SubtractionOperator,
                         MinOperator,
-                        MaxOperator]
+                        MaxOperator,
+                        DivisionOperator
+                        ]
 
     def __init__(self, genes, constants):
         """
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py
index 5790c1a..347c07b 100644
--- a/mep/genetics/operator.py
+++ b/mep/genetics/operator.py
@@ -1,3 +1,6 @@
+import math
+import traceback
+import numpy as np
 from abc import ABCMeta, abstractmethod
 
 
@@ -148,4 +151,32 @@ def function_python_definition(self):
         return """
 def max_(x, y):
     return max(x, y)
+        """
+
+
+class DivisionOperator(Operator):
+    """
+    Protected division: returns x / y, or 0 when the divisor y equals 0.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Divide args[0] by args[1]; raises RuntimeError unless given exactly two args.
+        """
+        if len(args) != 2:
+            raise RuntimeError("Division operator needs just two arguments")
+        x, y = args
+
+        # NOTE: protected division; a zero divisor yields 0 instead of ZeroDivisionError
+        return x / y if y != 0 else 0
+
+    def function_name(self):
+        """Name of the function defined by function_python_definition."""
+        return "division_"
+
+    def function_python_definition(self):
+        """Python source for division_; mirrors the rule used in __call__."""
+        return """
+def division_(x, y):
+    return x / y if y != 0 else 0
         """
\ No newline at end of file