From 585061ec31f84051bafa662909b4a061a22595d7 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Fri, 31 Mar 2017 20:51:21 -0400 Subject: [PATCH 01/51] Put the targets into the gene evaluate(..). --- mep/genetics/gene.py | 48 ++++++++++++++++++++++++++------- tests/mep/genetics/test_gene.py | 16 +++++++---- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index a00c8bd..14ce831 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -10,7 +10,7 @@ class Gene(object): __metaclass__ = ABCMeta @abstractmethod - def evaluate(self, gene_index, eval_matrix, data_matrix, constants): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. @@ -24,7 +24,10 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants): :type data_matrix: np.matrix :param constants: the constants associated with this chromosome :type constants: list - :return: nothing; modifies the eval_matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :return: error (sum of error across the examples); modifies the eval_matrix + :rtype: float """ @@ -49,7 +52,7 @@ def __init__(self, index, is_feature=True): self.index = index self.is_feature = is_feature - def evaluate(self, gene_index, eval_matrix, data_matrix, constants): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. 
@@ -64,18 +67,32 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants): :type data_matrix: np.matrix :param constants: the constants associated with this chromosome :type constants: list - :return: nothing; modifies the eval_matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :return: error (sum of error); modifies the eval_matrix + :rtype: float """ + # TODO: Move common logic up + # TODO: Handle classification as well as regression + # go through and set the data num_examples = eval_matrix.shape[1] + sum_of_errors = 0. for example_index in range(0, num_examples): # each column is one example in the data matrix (i.e. one feature vector) # if we are a feature variable then we look at the corresponding feature in the feature vector for this # example; otherwise (as a constant) we just go to that (independent of the example we are in) if self.is_feature: - eval_matrix[gene_index, example_index] = data_matrix[example_index, self.index] + value = data_matrix[example_index, self.index] else: - eval_matrix[gene_index, example_index] = constants[self.index] + value = constants[self.index] + # calculate error + sum_of_errors += abs(targets[example_index] - value) + + # set it in the eval matrix + eval_matrix[gene_index, example_index] = value + + return sum_of_errors def __str__(self): return "VariableGene({}, is_feature={})".format(self.index, self.is_feature) @@ -112,7 +129,7 @@ def __init__(self, operation, address1, address2): self.address1 = address1 self.address2 = address2 - def evaluate(self, gene_index, eval_matrix, data_matrix, constants): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. 
@@ -126,16 +143,27 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants): :type data_matrix: np.matrix :param constants: the constants associated with this chromosome :type constants: list - :return: nothing; modifies the eval_matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :return: error (sum of error); modifies the eval_matrix + :rtype: float + """ # go through and set the data num_examples = eval_matrix.shape[1] + sum_of_errors = 0. for example_index in range(0, num_examples): # each column is one example in the data matrix (i.e. one feature vector) # TODO: Catch errors; in particular division can be a problem - eval_matrix[gene_index, example_index] = self.operation(eval_matrix[self.address1][example_index], - eval_matrix[self.address2][example_index]) + value = self.operation(eval_matrix[self.address1][example_index], + eval_matrix[self.address2][example_index]) + # set it in the eval matrix + eval_matrix[gene_index, example_index] = value + + sum_of_errors += abs(targets[example_index] - value) + + return sum_of_errors def __str__(self): return "OperatorGene({}, {}, {})".format(self.operation, self.address1, self.address2) diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py index 40aa521..7b62d12 100644 --- a/tests/mep/genetics/test_gene.py +++ b/tests/mep/genetics/test_gene.py @@ -25,14 +25,16 @@ def test_basic_constant(self): constants = [1., 2.] 
eval_matrix = np.zeros((num_genes, num_examples)) data_matrix = np.zeros((num_examples, num_features)) + targets = [0] * num_examples # expected; only one gene and it is going to be using the first constant; gene_index = 0 expected_eval_matrix = np.matrix([[constants[constant_index], constants[constant_index]]]) # run the evaluate - gene.evaluate(gene_index, eval_matrix, data_matrix, constants) + error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) + self.assertEquals((1. - 0) + (1. - 0), error) def test_basic_feature_gene(self): """ @@ -51,6 +53,7 @@ def test_basic_feature_gene(self): constants = [1., 2.] eval_matrix = np.zeros((num_genes, num_examples)) data_matrix = np.zeros((num_examples, num_features)) + targets = [0] * num_examples # set the data matrix for the feature that we care about data_matrix[0, feature_index] = 5. @@ -61,8 +64,9 @@ def test_basic_feature_gene(self): expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]]]) # run the evaluate - gene.evaluate(gene_index, eval_matrix, data_matrix, constants) + error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) + self.assertEquals((5. - 0.) + (7. - 0.), error) def test_constant_and_feature_gene(self): """ @@ -83,6 +87,7 @@ def test_constant_and_feature_gene(self): constants = [1., 2.] eval_matrix = np.zeros((num_genes, num_examples)) data_matrix = np.zeros((num_examples, num_features)) + targets = [0] * num_examples # set the data matrix for the feature that we care about data_matrix[0, feature_index] = 5. 
@@ -93,8 +98,8 @@ def test_constant_and_feature_gene(self): [constants[constant_index], constants[constant_index]]]) # run the evaluate - feature_gene.evaluate(0, eval_matrix, data_matrix, constants) - constant_gene.evaluate(1, eval_matrix, data_matrix, constants) + feature_error = feature_gene.evaluate(0, eval_matrix, data_matrix, constants, targets) + constant_error = constant_gene.evaluate(1, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) def test_operator_gene_basic(self): @@ -111,6 +116,7 @@ def test_operator_gene_basic(self): num_examples = 1 num_genes = 2 num_features = 3 + targets = [0] * num_examples # create constants = [] @@ -125,5 +131,5 @@ def test_operator_gene_basic(self): [4]]) # run the evaluate - gene.evaluate(1, eval_matrix, data_matrix, constants) + error = gene.evaluate(1, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) From 65b519e191141e43ed141a9ba219d652c07ea680 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Fri, 31 Mar 2017 21:48:16 -0400 Subject: [PATCH 02/51] Chromosome evaluate method. 
--- mep/genetics/chromosome.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 6d58da2..3d9c6aa 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -1,4 +1,5 @@ import logging +import numpy as np from mep.genetics.gene import Gene, VariableGene, OperatorGene from random import random, randint, choice @@ -29,7 +30,9 @@ def __init__(self, genes, constants): self.genes = genes self.constants = constants - # TODO: track the best fitness and the associated best gene seen so far + # track the best found error and the associated gene + self.error = float('inf') + self.best_gene_index = -1 @classmethod def generate_random_chromosome(cls, num_constants, constants_min, constants_max, constants_prob, @@ -86,6 +89,25 @@ def generate_random_chromosome(cls, num_constants, constants_min, constants_max, # construct and return the chromosome return Chromosome(genes, constants) + def evaluate(self, data_matrix, targets): + """ + Evaluate the various genes. + + :param data_matrix: the data matrix; rows are feature vectors; comes from the data set; it is (n, m) where "n" + is the number of examples and "m" is the number of features. 
+ :type data_matrix: np.matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + """ + num_examples = data_matrix.shape[0] + eval_matrix = np.zeros((len(self.genes), num_examples)) + for gene_index, gene in enumerate(self.genes): + # compute the error for this gene; if it is the best we have found then update + error = gene.evaluate(gene_index, eval_matrix, data_matrix, self.constants, targets) + if error < self.error: + self.error = error + self.best_gene_index = gene_index + def __str__(self): return "Chromosome({}, {})".format(self.genes, self.constants) From f9fcd4a39b2055412201f998e348b09a45f6c084 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 1 Apr 2017 12:52:55 -0400 Subject: [PATCH 03/51] This adds the test for the chromosome evaluate method. --- tests/mep/genetics/test_chromosome.py | 35 ++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py index 8cecf9d..628c803 100644 --- a/tests/mep/genetics/test_chromosome.py +++ b/tests/mep/genetics/test_chromosome.py @@ -1,10 +1,26 @@ import unittest import random -from mep.genetics.gene import VariableGene, OperatorGene +from mep.genetics.gene import VariableGene, OperatorGene, Gene from mep.genetics.chromosome import Chromosome import numpy as np +class MockedGene(Gene): + def __init__(self, error_to_return): + """ + Initialize. + :param error_to_return: what to return in the evaluate + :type error_to_return: float + """ + self.error_to_return = error_to_return + + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): + """ + Simple mocked version. + """ + return self.error_to_return + + class TestChromosome(unittest.TestCase): """ Tests for the chromosome. 
@@ -38,3 +54,20 @@ def test_basic_random_construction(self): # verify constant self.assertAlmostEquals(8.599796663725433, chromosome.constants[0]) + + def test_evaluate(self): + """ + Basic test of the evaluate method. + """ + # construct mocked genes + genes = [MockedGene(10), MockedGene(1)] + + # construct chromosome + chromosome = Chromosome(genes, constants=[1, 2, 3]) + + # evaluate + chromosome.evaluate(np.zeros((2, 2)), targets=[20, 30]) + + # confirm the genes + self.assertEqual(genes[1], genes[chromosome.best_gene_index]) + self.assertEqual(genes[1].error_to_return, chromosome.error) From 8ecb9629a6797d7f2ce7f90a0f2c3ff42221da31 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 1 Apr 2017 13:57:34 -0400 Subject: [PATCH 04/51] Initialize mechanism for population and sorting chromosomes. --- mep/genetics/chromosome.py | 10 ++++ mep/genetics/population.py | 67 ++++++++++++++++++++++++++- tests/mep/genetics/test_chromosome.py | 16 +++++++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 3d9c6aa..0fe8be5 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -113,3 +113,13 @@ def __str__(self): def __repr__(self): return self.__str__() + + def __lt__(self, other): + """ + Less-than used by sort(...) + + :param other: + :type other: Chromosome + :return: + """ + return self.error < other.error \ No newline at end of file diff --git a/mep/genetics/population.py b/mep/genetics/population.py index 55ae601..dcfd097 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -1,4 +1,69 @@ +from mep.genetics.chromosome import Chromosome + + class Population(object): """ A collection of chromosomes. 
- """ \ No newline at end of file + """ + + def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max, constants_prob, + feature_variable_prob, num_genes, num_chromosomes, operators_prob): + """ + Build a randomly constructed chromosome. + + :param data_matrix: the data matrix; rows are feature vectors; comes from the data set; it is (n, m) where "n" + is the number of examples and "m" is the number of features. + :type data_matrix: np.matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :param num_constants: how many constants to have + :type num_constants: int + :param constants_min: the min range of the constants + :type constants_min: float + :param constants_max: the max range of the constants + :type constants_max: float + :param constants_prob: the probability that a given gene is a constant + :type constants_prob: float + :param feature_variable_prob: the probability that a given gene is a feature variable + :type feature_variable_prob: float + :param num_genes: how many genes + :type num_genes: int + :param num_chromosomes: how many chromosomes to use + :type num_chromosomes: int + :param operators_prob: the probability that a given gene is an operator + :type operators_prob: float + """ + # set the variables + self.data_matrix = data_matrix + self.targets = targets + self.num_constants = num_constants + self.constants_min = constants_min + self.constants_max = constants_max + self.constants_prob = constants_prob + self.feature_variable_prob = feature_variable_prob + self.num_feature_variables = self.data_matrix.shape[1] + self.num_genes = num_genes + self.num_chromosomes = num_chromosomes + self.operators_prob = operators_prob + + # the chromosomes + self.chromosomes = None + + def initialize(self): + """ + Initialize the random chromosomes. 
+ """ + # generate the random chromosomes + self.chromosomes = [Chromosome.generate_random_chromosome(self.num_constants, self.constants_min, + self.constants_max, self.constants_prob, + self.feature_variable_prob, + self.num_feature_variables, self.num_genes, + self.operators_prob) + for _ in range(self.num_chromosomes)] + + # evaluate + # TODO: this could be done in parallel + for chromosome in self.chromosomes: + chromosome.evaluate(self.data_matrix, self.targets) + + # TODO: sort them? diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py index 628c803..f498274 100644 --- a/tests/mep/genetics/test_chromosome.py +++ b/tests/mep/genetics/test_chromosome.py @@ -71,3 +71,19 @@ def test_evaluate(self): # confirm the genes self.assertEqual(genes[1], genes[chromosome.best_gene_index]) self.assertEqual(genes[1].error_to_return, chromosome.error) + + def test_sort(self): + """ + Test the sort mechanism. + """ + # construct the chromosomes and test sorting them (by error) + min_chromosome, mid_chromosome, max_chromosome = Chromosome([], []), Chromosome([], []), Chromosome([], []) + min_chromosome.error = 1 + mid_chromosome.error = 2 + max_chromosome.error = 3 + chromosomes = [mid_chromosome, max_chromosome, min_chromosome] + expected_chromosomes = [min_chromosome, mid_chromosome, max_chromosome] + + # do the sort and verify + chromosomes.sort() + self.assertEqual(expected_chromosomes, chromosomes) From 7dd2cc709004854452827c9896a708bcbb0bb8ae Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 1 Apr 2017 15:00:28 -0400 Subject: [PATCH 05/51] Random tournament selection. 
--- mep/genetics/population.py | 27 +++++++++++++- tests/mep/genetics/test_population.py | 51 +++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/mep/genetics/test_population.py diff --git a/mep/genetics/population.py b/mep/genetics/population.py index dcfd097..073fd7d 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -1,4 +1,5 @@ from mep.genetics.chromosome import Chromosome +import random class Population(object): @@ -66,4 +67,28 @@ def initialize(self): for chromosome in self.chromosomes: chromosome.evaluate(self.data_matrix, self.targets) - # TODO: sort them? + # sort the chromosomes + self.chromosomes.sort() + + def random_tournament_selection(self, tournament_size): + """ + Randomly select (tournament_size) chromosomes and return the best one. + :param tournament_size: the size of the tournament + :type tournament_size: int + :return: the + """ + # TODO: Check for bad tournament size + best_chromosome = None + for _ in range(tournament_size): + chromosome = random.choice(self.chromosomes) + if best_chromosome is None or chromosome.error < best_chromosome.error: + best_chromosome = chromosome + + return best_chromosome + + def next_generation(self): + """ + Advance to the next generation. + """ + # TODO: populate + diff --git a/tests/mep/genetics/test_population.py b/tests/mep/genetics/test_population.py new file mode 100644 index 0000000..29b235f --- /dev/null +++ b/tests/mep/genetics/test_population.py @@ -0,0 +1,51 @@ +import unittest +import random +import numpy as np +from mep.genetics.population import Population +from mep.genetics.chromosome import Chromosome + +class TestPopulation(unittest.TestCase): + """ + Test the Population class. + """ + + def test_random_tournament_selection(self): + """ + Test the random_tournament_selection(...) 
+ """ + # make it so this repeatable + random.seed(0) + + # construct the population + num_examples = 5 + num_features = 7 + population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1) + + # confirm the number of feature variables (not critical for this test) + self.assertEqual(num_features, population.num_feature_variables) + + # test the tournament selection; not that it randomly chooses the not as good chromosome + min_chromosome, max_chromosome = Chromosome([], []), Chromosome([], []) + min_chromosome.error = 1 + max_chromosome.error = 2 + population.chromosomes = [min_chromosome, max_chromosome] + self.assertEqual(max_chromosome, population.random_tournament_selection(1)) + + def test_larger_random_tournament_selection(self): + """ + Test the random_tournament_selection(...) + """ + # make it so this repeatable + random.seed(0) + + # construct the population + num_examples = 5 + num_features = 7 + population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1) + + # test the tournament selection; not that it randomly chooses the not as good chromosome + min_chromosome, max_chromosome = Chromosome([], []), Chromosome([], []) + min_chromosome.error = 1 + max_chromosome.error = 2 + population.chromosomes = [min_chromosome, max_chromosome] + self.assertEqual(min_chromosome, population.random_tournament_selection(10)) \ No newline at end of file From f110899d2ecc25bb72c7b6a0a8ab5afbffbef5cf Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Thu, 6 Apr 2017 21:44:06 -0400 Subject: [PATCH 06/51] Chromosome crossover logic. 
--- mep/genetics/population.py | 49 +++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/mep/genetics/population.py b/mep/genetics/population.py index 073fd7d..be477a7 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -47,6 +47,10 @@ def __init__(self, data_matrix, targets, num_constants, constants_min, constants self.num_chromosomes = num_chromosomes self.operators_prob = operators_prob + # TODO: take in + self.crossover_prob = 0.9 + self.mutation_prob = 0.1 + # the chromosomes self.chromosomes = None @@ -86,9 +90,52 @@ def random_tournament_selection(self, tournament_size): return best_chromosome + def one_cut_point_crossover(self, parent1, parent2): + """ + Construct two offspring chromosomes from the parents. We determine the crossover point so that we + take the first genes up to that point from parent1/parent2 and then we switch. + :param parent1: one parent chromosome + :type parent1: Chromosome + :param parent2: the other parent chromosome + :type parent2: Chromosome + :return: two offsprings + :rtype: (Chromosome, Chromosome) + """ + # construct the genes and constants for the offsprings from the parents + offspring1 = Chromosome([], []) + offspring2 = Chromosome([], []) + + # determine the crossover point; + cutting_point = random.randint(0, self.num_genes) + + # copy over the genes; first half and now the 2nd half (from the other chromosome) + offspring1.genes = parent1.genes[:cutting_point] + parent2.genes[cutting_point:] + offspring2.genes = parent2.genes[:cutting_point] + parent1.genes[cutting_point:] + + # same thing with the constants + cutting_point = random.randint(0, self.num_constants) + + # copy over the constants; first half and now the 2nd half + offspring1.constants = parent1.constants[:cutting_point] + parent2.constants[cutting_point:] + offspring2.constants = parent2.constants[:cutting_point] + parent1.constants[cutting_point:] + + return offspring1, offspring2 + def 
next_generation(self): """ Advance to the next generation. """ - # TODO: populate + for _ in range(0, len(self.chromosomes), 2): + # select parents + chromosome1 = self.random_tournament_selection(2) + chromosome2 = self.random_tournament_selection(2) + + # crossover + if random.random() < self.crossover_prob: + # TODO: do crossover + pass + else: + # offspring are copies of the parents + pass + # TODO: fill in From 846e47576410a3bc121728cbb28584e24e773b78 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 8 Apr 2017 18:38:27 -0400 Subject: [PATCH 07/51] Mutation method. --- mep/genetics/chromosome.py | 51 +++++++++++++++++++++++++++++++++++++- mep/genetics/population.py | 23 ++++++++++++++--- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 0fe8be5..34329b3 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -108,6 +108,55 @@ def evaluate(self, data_matrix, targets): self.error = error self.best_gene_index = gene_index + def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max, constants_prob, + feature_variable_prob, num_feature_variables, num_genes, operators_prob): + """ + Mutate the chromosome. Works by going through and randomly mutating genes and then constants. 
+ :param gene_mutation_prob: probability to mutate a given gene + :type gene_mutation_prob: float + :param num_constants: how many constants to have + :type num_constants: int + :param constants_min: the min range of the constants + :type constants_min: float + :param constants_max: the max range of the constants + :type constants_max: float + :param constants_prob: the probability that a given gene is a constant + :type constants_prob: float + :param feature_variable_prob: the probability that a given gene is a feature variable + :type feature_variable_prob: float + :param num_feature_variables: how many features we have + :type num_feature_variables: int + :param num_genes: how many genes + :type num_genes: int + :param operators_prob: the probability that a given gene is an operator + :type operators_prob: float + :return: nothing + """ + # the probabilities are all the same for generating a random chromosome; therefore let's construct + # a random chromosome and then (effectively) do a uniform crossover where a "mutate" means that we + # take the new chromosome's gene/constants + # TODO: Should we have these variables set in the chromosome then? + # TODO: maybe just pass in this random chromosome then? 
+ random_chromosome = Chromosome.generate_random_chromosome(num_constants, constants_min, + constants_max, constants_prob, + feature_variable_prob, + num_feature_variables, num_genes, + operators_prob) + + # go through mutating genes; + for gene_index in range(len(self.genes)): + # decide if we are going to mutate this gene + if random() <= gene_mutation_prob: + # mutated; therefore grab the corresponding gene from the random chromosome + self.genes[gene_index] = random_chromosome.genes[gene_index] + + # go through mutating constants; + for constants_index in range(len(self.constants)): + # decide if we are going to mutate this gene + if random() <= gene_mutation_prob: + # mutated; therefore grab the corresponding constant from the random chromosome + self.constants[constants_index] = random_chromosome.constants[constants_index] + def __str__(self): return "Chromosome({}, {})".format(self.genes, self.constants) @@ -122,4 +171,4 @@ def __lt__(self, other): :type other: Chromosome :return: """ - return self.error < other.error \ No newline at end of file + return self.error < other.error diff --git a/mep/genetics/population.py b/mep/genetics/population.py index be477a7..fc73366 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -1,5 +1,6 @@ from mep.genetics.chromosome import Chromosome import random +import copy class Population(object): @@ -108,6 +109,7 @@ def one_cut_point_crossover(self, parent1, parent2): # determine the crossover point; cutting_point = random.randint(0, self.num_genes) + # TODO: copy the genes # copy over the genes; first half and now the 2nd half (from the other chromosome) offspring1.genes = parent1.genes[:cutting_point] + parent2.genes[cutting_point:] offspring2.genes = parent2.genes[:cutting_point] + parent1.genes[cutting_point:] @@ -132,10 +134,23 @@ def next_generation(self): # crossover if random.random() < self.crossover_prob: - # TODO: do crossover - pass + offspring1, offspring2 = 
self.one_cut_point_crossover(chromosome1, chromosome2) else: # offspring are copies of the parents - pass - # TODO: fill in + offspring1 = copy.copy(chromosome1) + offspring2 = copy.copy(chromosome2) + + # mutate (potentially) offspring + offspring1.mutate(self.mutation_prob, self.num_constants, self.constants_min, + self.constants_max, self.constants_prob, + self.feature_variable_prob, + self.num_feature_variables, self.num_genes, + self.operators_prob) + # TODO: evaluate + offspring2.mutate(self.mutation_prob, self.num_constants, self.constants_min, + self.constants_max, self.constants_prob, + self.feature_variable_prob, + self.num_feature_variables, self.num_genes, + self.operators_prob) + # TODO: fill in From 7059d0f2323720b328f4fa22dcd6d7f54285d73d Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 8 Apr 2017 19:08:42 -0400 Subject: [PATCH 08/51] Offsping evaluation and insertion. --- mep/genetics/population.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/mep/genetics/population.py b/mep/genetics/population.py index fc73366..da5e3b0 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -140,17 +140,41 @@ def next_generation(self): offspring1 = copy.copy(chromosome1) offspring2 = copy.copy(chromosome2) - # mutate (potentially) offspring + # TODO: we could consolidate the offspring code into one method and just call it twice + # mutate (potentially) offspring; calculate error offspring1.mutate(self.mutation_prob, self.num_constants, self.constants_min, self.constants_max, self.constants_prob, self.feature_variable_prob, self.num_feature_variables, self.num_genes, self.operators_prob) - # TODO: evaluate + offspring1.evaluate(self.data_matrix, self.targets) offspring2.mutate(self.mutation_prob, self.num_constants, self.constants_min, self.constants_max, self.constants_prob, self.feature_variable_prob, self.num_feature_variables, self.num_genes, self.operators_prob) + 
offspring2.evaluate(self.data_matrix, self.targets) + + # replace the worst chromosome in the population; note that the chromosomes start in a sorted + # order so the one at the end has the highest error; we now insert the offspring into the list + # at their error level -- i.e. keep it in sorted order + # TODO: We should be able to do this in one loop but let's do each offspring separately as it is clearer + insert_index = -1 + for chromosome_index, chromosome in enumerate(self.chromosomes): + if offspring1.error < chromosome.error: + insert_index = chromosome_index + break + # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index + if insert_index > -1: + self.chromosomes.insert(insert_index, offspring1) + + # now the other offspring + insert_index = -1 + for chromosome_index, chromosome in enumerate(self.chromosomes): + if offspring2.error < chromosome.error: + insert_index = chromosome_index + break + # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index + if insert_index > -1: + self.chromosomes.insert(insert_index, offspring2) - # TODO: fill in From c5b124d14af835289662a0e07067b85776db74c1 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sun, 9 Apr 2017 09:35:16 -0400 Subject: [PATCH 09/51] Running the population. Messy prints but working. 
--- mep/config/config.json | 2 +- mep/dataset.py | 8 ++++++-- mep/genetics/chromosome.py | 32 ++++++++++++++++++++++++++++++++ mep/genetics/gene.py | 11 +++++++++++ mep/genetics/population.py | 6 ++---- mep/main.py | 38 +++++++++++++++++++++++++++++++++----- 6 files changed, 85 insertions(+), 12 deletions(-) diff --git a/mep/config/config.json b/mep/config/config.json index d1dea47..efb51db 100644 --- a/mep/config/config.json +++ b/mep/config/config.json @@ -8,7 +8,7 @@ "mutation_probability": 0.1, "crossover_probability": 0.9, - "variables_probability": 0.4, + "feature_variables_probability": 0.4, "operators_probability": 0.5, "num_constants": 3, diff --git a/mep/dataset.py b/mep/dataset.py index 850d93a..4f09241 100644 --- a/mep/dataset.py +++ b/mep/dataset.py @@ -13,5 +13,9 @@ def __init__(self, filename): :param filename: the filename (full path to CSV) of the data :type filename: str """ - # TODO: What about supporting other file formats? - self.data = pd.read_csv(filename) \ No newline at end of file + # we assume this in the format of feature cols and then target + self.data = pd.read_csv(filename) + + # extract out data matrix and target + self.target = self.data.target.values + self.data_matrix = self.data.drop("target", 1).values diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 34329b3..ca96f4f 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -160,6 +160,38 @@ def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max def __str__(self): return "Chromosome({}, {})".format(self.genes, self.constants) + def pretty_string(self, stop_at_best=True): + """ + Output in a program like format. First show the constants. Then one line per gene. 
+ :return: the program + :rtype: str + """ + # first we show the constants + program = "CONSTANTS = [{}]\n".format(",".join([str(c) for c in self.constants])) + + # now show each gene on a separate line + for gene_index, gene in enumerate(self.genes): + gene_str = gene.__str__() + if type(gene) == VariableGene: + gene_str = gene.pretty_string() + elif type(gene) == OperatorGene: + # TODO: Push this logic into the gene; the only tricky part is the operator lambda; we will probably + # need to replace the lambda with a larger object + if gene.operation == Chromosome.operator_lambdas[0]: + op = "+" + elif gene.operation == Chromosome.operator_lambdas[1]: + op = "-" + elif gene.operation == Chromosome.operator_lambdas[2]: + op = "*" + gene_str = "PROGRAM[{}] {} PROGRAM[{}]".format(gene.address1, op, gene.address2) + program += "{}:{}\n".format(gene_index, gene_str) + + if self.best_gene_index == gene_index and stop_at_best: + return program + + # if we want to print the full program + return program + def __repr__(self): return self.__str__() diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index 14ce831..b1adc0a 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -97,6 +97,17 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): def __str__(self): return "VariableGene({}, is_feature={})".format(self.index, self.is_feature) + def pretty_string(self): + """ + Pretty program string version. + :return: string version + :rtype: str + """ + if self.is_feature: + return "FEATURES[{}]".format(self.index) + else: + return "CONSTANTS[{}]".format(self.index) + def __repr__(self): return self.__str__() diff --git a/mep/genetics/population.py b/mep/genetics/population.py index da5e3b0..2aea971 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -8,7 +8,7 @@ class Population(object): A collection of chromosomes. 
""" - def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max, constants_prob, + def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max, feature_variable_prob, num_genes, num_chromosomes, operators_prob): """ Build a randomly constructed chromosome. @@ -24,8 +24,6 @@ def __init__(self, data_matrix, targets, num_constants, constants_min, constants :type constants_min: float :param constants_max: the max range of the constants :type constants_max: float - :param constants_prob: the probability that a given gene is a constant - :type constants_prob: float :param feature_variable_prob: the probability that a given gene is a feature variable :type feature_variable_prob: float :param num_genes: how many genes @@ -41,7 +39,7 @@ def __init__(self, data_matrix, targets, num_constants, constants_min, constants self.num_constants = num_constants self.constants_min = constants_min self.constants_max = constants_max - self.constants_prob = constants_prob + self.constants_prob = 1. 
- operators_prob - feature_variable_prob self.feature_variable_prob = feature_variable_prob self.num_feature_variables = self.data_matrix.shape[1] self.num_genes = num_genes diff --git a/mep/main.py b/mep/main.py index b42be36..e7abdd6 100644 --- a/mep/main.py +++ b/mep/main.py @@ -3,14 +3,20 @@ import json import logging import os +from dataset import DataSet +from mep.genetics.population import Population if __name__ == "__main__": - # TODO: Get the data file + # TODO: error check usage + + # get the data file + data_set_name = sys.argv[1] + data_set = DataSet(data_set_name) # read config file - # TODO: Possible config file override on comand line - with open("mep/config/config.json") as data_file: - config = json.load(data_file) + # TODO: Possible config file override on command line + with open("mep/config/config.json") as config_file: + config = json.load(config_file) # construct output logs dir if it doesn't exist output_logs_dir = config["output_logs"] @@ -25,5 +31,27 @@ logger = logging.getLogger("main") logger.info("Starting up...") - + # construct a population and run it for the number of generations specified + population = Population(data_set.data_matrix, data_set.target, int(config["num_constants"]), + float(config["constants_min"]), float(config["constants_max"]), + float(config["feature_variables_probability"]), + int(config["code_length"]), int(config["population_size"]), + float(config["operators_probability"])) + population.initialize() + + # iterate through the generations + best_chromosome = None + for generation in range(int(config["num_generations"])): + best_chromosome = population.chromosomes[0] + logger.debug("Generation number {} best chromosome error {}".format(generation, + best_chromosome.error)) + print("Generation number {} best chromosome error {}".format(generation, + best_chromosome.error)) + if best_chromosome.error == 0: + logger.debug("Exiting early as we have hit the best possible error.") + break + population.next_generation() 
+ print("Best chromosome error {} and chromosome {}".format(best_chromosome.error, best_chromosome)) + print("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, + best_chromosome.pretty_string())) From fee6039a99b70243f2e64215c8a5e6bc71dfcdc0 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Tue, 11 Apr 2017 19:49:36 -0400 Subject: [PATCH 10/51] Updated the readme --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 632143f..711bfd8 100644 --- a/README.md +++ b/README.md @@ -1 +1,7 @@ -# py-mep \ No newline at end of file +# Multi Expression Programming + +This is an implementation of the MEP algorithm defined here: + +Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB. + +Based upon the C++ code here: https://github.com/mepx/mep-basic-src. From d7f0b04dadb1407aaf53d94d72f644719da6b90c Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Tue, 11 Apr 2017 20:11:59 -0400 Subject: [PATCH 11/51] The environment.yaml to build the conda environment. --- README.md | 4 +- environment.yaml | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 environment.yaml diff --git a/README.md b/README.md index 711bfd8..78e8182 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,6 @@ This is an implementation of the MEP algorithm defined here: -Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB. +> Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB. -Based upon the C++ code [here](https://github.com/mepx/mep-basic-src). +Based upon the C++ code [here](https://github.com/mepx/mep-basic-src). 
\ No newline at end of file diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..61b1b1c --- /dev/null +++ b/environment.yaml @@ -0,0 +1,95 @@ +name: py-mep-dev +dependencies: +- backports=1.0=py27_0 +- backports_abc=0.5=py27_0 +- bleach=1.5.0=py27_0 +- configparser=3.5.0=py27_0 +- dbus=1.10.10=0 +- decorator=4.0.11=py27_0 +- entrypoints=0.2.2=py27_1 +- enum34=1.1.6=py27_0 +- expat=2.1.0=0 +- fontconfig=2.12.1=3 +- freetype=2.5.5=2 +- functools32=3.2.3.2=py27_0 +- get_terminal_size=1.0.0=py27_0 +- glib=2.50.2=1 +- gst-plugins-base=1.8.0=0 +- gstreamer=1.8.0=0 +- html5lib=0.999=py27_0 +- icu=54.1=0 +- ipykernel=4.6.0=py27_0 +- ipython=5.3.0=py27_0 +- ipython_genutils=0.2.0=py27_0 +- ipywidgets=6.0.0=py27_0 +- jinja2=2.9.6=py27_0 +- jpeg=9b=0 +- jsonschema=2.5.1=py27_0 +- jupyter=1.0.0=py27_3 +- jupyter_client=5.0.1=py27_0 +- jupyter_console=5.1.0=py27_0 +- jupyter_core=4.3.0=py27_0 +- libffi=3.2.1=1 +- libgcc=5.2.0=0 +- libiconv=1.14=0 +- libpng=1.6.27=0 +- libsodium=1.0.10=0 +- libxcb=1.12=1 +- libxml2=2.9.4=0 +- markupsafe=0.23=py27_2 +- mistune=0.7.4=py27_0 +- mkl=2017.0.1=0 +- nbconvert=5.1.1=py27_0 +- nbformat=4.3.0=py27_0 +- notebook=5.0.0=py27_0 +- numpy=1.12.1=py27_0 +- openssl=1.0.2k=1 +- pandas=0.19.2=np112py27_1 +- pandocfilters=1.4.1=py27_0 +- path.py=10.1=py27_0 +- pathlib2=2.2.1=py27_0 +- pcre=8.39=1 +- pexpect=4.2.1=py27_0 +- pickleshare=0.7.4=py27_0 +- pip=9.0.1=py27_1 +- prompt_toolkit=1.0.14=py27_0 +- ptyprocess=0.5.1=py27_0 +- py=1.4.32=py27_0 +- pygments=2.2.0=py27_0 +- pyqt=5.6.0=py27_2 +- pytest=3.0.7=py27_0 +- python=2.7.13=0 +- python-dateutil=2.6.0=py27_0 +- pytz=2017.2=py27_0 +- pyzmq=16.0.2=py27_0 +- qt=5.6.2=3 +- qtconsole=4.3.0=py27_0 +- readline=6.2=2 +- scandir=1.5=py27_0 +- setuptools=27.2.0=py27_0 +- simplegeneric=0.8.1=py27_1 +- singledispatch=3.4.0.3=py27_0 +- sip=4.18=py27_0 +- six=1.10.0=py27_0 +- sqlite=3.13.0=0 +- ssl_match_hostname=3.4.0.2=py27_1 +- terminado=0.6=py27_0 +- testpath=0.3=py27_0 +- 
tk=8.5.18=0 +- tornado=4.4.2=py27_0 +- traitlets=4.3.2=py27_0 +- wcwidth=0.1.7=py27_0 +- wheel=0.29.0=py27_0 +- widgetsnbextension=2.0.0=py27_0 +- zeromq=4.1.5=0 +- zlib=1.2.8=3 +- pip: + - backports-abc==0.5 + - backports.shutil-get-terminal-size==1.0.0 + - backports.ssl-match-hostname==3.4.0.2 + - ipython-genutils==0.2.0 + - jupyter-client==5.0.1 + - jupyter-console==5.1.0 + - jupyter-core==4.3.0 + - prompt-toolkit==1.0.14 + From b0e082d27eaba41ab7f9da2adbb3fd66887fbead Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Tue, 11 Apr 2017 20:27:26 -0400 Subject: [PATCH 12/51] Example datasets. --- .gitignore | 3 ++- README.md | 13 ++++++++++++- datasets/data1.csv | 10 ++++++++++ datasets/data2.csv | 10 ++++++++++ datasets/files.txt | 2 ++ 5 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 datasets/data1.csv create mode 100644 datasets/data2.csv create mode 100644 datasets/files.txt diff --git a/.gitignore b/.gitignore index 8419443..9eda0dc 100644 --- a/.gitignore +++ b/.gitignore @@ -92,4 +92,5 @@ ENV/ .idea/* # logs -output_logs/* \ No newline at end of file +output_logs/* +ignored/* \ No newline at end of file diff --git a/README.md b/README.md index 78e8182..192029c 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,15 @@ This is an implmentation of the MEP algorithm defined here: > Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB. -Based upon the C++ code [here](https://github.com/mepx/mep-basic-src). \ No newline at end of file +Based upon the C++ code [here](https://github.com/mepx/mep-basic-src). 
+ +## Running py-mep + +Create the conda environment and source it (Linux): + +``` +conda env create -f environment.yml +source activate py-mep-dev +``` + +Example, running with a dataset `python -m mep.main datasets/data1.csv` \ No newline at end of file diff --git a/datasets/data1.csv b/datasets/data1.csv new file mode 100644 index 0000000..2a906b5 --- /dev/null +++ b/datasets/data1.csv @@ -0,0 +1,10 @@ +x1,x2,target +0,0,0 +1,2,3 +12,2,14 +-12,90,78 +3,4,7 +0,-1,-1 +23,0,23 +8,16,24 +-10,-15,-25 diff --git a/datasets/data2.csv b/datasets/data2.csv new file mode 100644 index 0000000..f839365 --- /dev/null +++ b/datasets/data2.csv @@ -0,0 +1,10 @@ +x1,x2,x3,target +0,0,0,0 +1,2,2,4 +12,2,4,28 +-12,90,5,-1075 +3,4,1,13 +0,-1,-10,-10 +23,0,15,15 +8,16,1,129 +-10,-15,-50,100 diff --git a/datasets/files.txt b/datasets/files.txt new file mode 100644 index 0000000..d8aa9d8 --- /dev/null +++ b/datasets/files.txt @@ -0,0 +1,2 @@ +data1.csv is f(x_1, x_2) = x_1 + x_2 +data2.csv is f(x_1, x_2) = x_1 * x_2 + x_3 From bdf3ab2a4e5800ac699f52fccd94e794ead1ed9e Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Fri, 14 Apr 2017 12:14:31 -0400 Subject: [PATCH 13/51] Fix the missing 's' in the logger format. --- mep/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mep/main.py b/mep/main.py index e7abdd6..d8afb6e 100644 --- a/mep/main.py +++ b/mep/main.py @@ -27,7 +27,7 @@ logging.basicConfig(filename="{}/MEP_{}.log".format(output_logs_dir, dt.datetime.now().strftime("%Y%m%d")), level=logging.DEBUG, filemode='w', - format="%(asctime)s %(name)s %(funcName)s %(levelname) %(message)s") + format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s") logger = logging.getLogger("main") logger.info("Starting up...") From 545d2717d63a6242a9064b7d56949abbbadf7eb7 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Fri, 14 Apr 2017 20:01:27 -0400 Subject: [PATCH 14/51] Logic to prune the unused genes. 
--- mep/genetics/chromosome.py | 51 +++++++++++++++++++++++++++++++++++++- mep/main.py | 17 +++++++++---- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index ca96f4f..172f658 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -1,5 +1,6 @@ import logging import numpy as np +from collections import deque from mep.genetics.gene import Gene, VariableGene, OperatorGene from random import random, randint, choice @@ -24,7 +25,7 @@ def __init__(self, genes, constants): :param constants: the constants :type constants: list of float """ - # self.logger = logging.getLogger(self.__class__) + self.logger = logging.getLogger(self.__class__.__name__) # core genes and constants lists self.genes = genes @@ -192,6 +193,54 @@ def pretty_string(self, stop_at_best=True): # if we want to print the full program return program + def prune(self): + """ + Trim out the unused genes. NOTE: This "breaks" the chromosomes as it is going to change how many genes are + in the program. Only do this once we have finished evolving the program. + """ + # the best gene index is going to be the last line of the program; since the genes never reference genes + # beyond it then we just proceed back to the top and remove any which haven't been referenced; we determine + # this via a BFS type search + + # the genes that are in use -- i.e. 
that will be kept; + gene_indices_in_use = set() + visited = set() + + # start from best gene index + genes_indices_to_visit = deque() + genes_indices_to_visit.appendleft(self.best_gene_index) + gene_indices_in_use.add(self.best_gene_index) + + while len(genes_indices_to_visit) > 0: + # the index to visit + gene_index = genes_indices_to_visit.pop() + + # mark as visited + visited.add(gene_index) + + # check the addresses on the gene if it is an operator + gene = self.genes[gene_index] + if type(gene) == OperatorGene: + genes_indices_to_visit.appendleft(gene.address1) + genes_indices_to_visit.appendleft(gene.address2) + gene_indices_in_use.add(gene.address1) + gene_indices_in_use.add(gene.address2) + self.logger.debug("At gene index {} which references {} and {}".format(gene_index, + gene.address1, gene.address2)) + + # now remove any genes that aren't used + gene_indices_in_use = list(gene_indices_in_use) + gene_indices_in_use.sort() + self.logger.debug("All gene indices in use {}".format(gene_indices_in_use)) + self.genes = [self.genes[i] for i in gene_indices_in_use] + + # TODO: This could be done in the list comprehension but it is clearer to just do another pass + # re-map the address to the new index + for gene in self.genes: + if type(gene) == OperatorGene: + gene.address1 = gene_indices_in_use.index(gene.address1) + gene.address2 = gene_indices_in_use.index(gene.address2) + def __repr__(self): return self.__str__() diff --git a/mep/main.py b/mep/main.py index d8afb6e..985bdd1 100644 --- a/mep/main.py +++ b/mep/main.py @@ -45,13 +45,20 @@ best_chromosome = population.chromosomes[0] logger.debug("Generation number {} best chromosome error {}".format(generation, best_chromosome.error)) - print("Generation number {} best chromosome error {}".format(generation, - best_chromosome.error)) if best_chromosome.error == 0: logger.debug("Exiting early as we have hit the best possible error.") break population.next_generation() - print("Best chromosome error {} and 
chromosome {}".format(best_chromosome.error, best_chromosome)) - print("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, - best_chromosome.pretty_string())) + logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, + best_chromosome.pretty_string())) + + # prune out the unused genes + best_chromosome.prune() + logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, + best_chromosome.pretty_string())) + + # TODO: Convert the output to a valid python program + # TODO: Add support for classification + # TODO: Add example digital circuit test + # TODO: Add UDFs From dacec6d1f859511d4d7ed8309d22ee2a50ece2e7 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sun, 16 Apr 2017 13:33:35 -0400 Subject: [PATCH 15/51] Convert to a python program. --- mep/genetics/chromosome.py | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 172f658..f0e8c5c 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -241,6 +241,55 @@ def prune(self): gene.address1 = gene_indices_in_use.index(gene.address1) gene.address2 = gene_indices_in_use.index(gene.address2) + def to_python(self): + """ + Convert to python program string. 
+ :return: python string program + :rtype: str + """ + # python program string + python_program = """ +import sys + +if __name__ == "__main__": + # constants + {} + + # now the genes + {} + + # print out the final answer + {} + """ + + # constants + constants_str = "constants = {}".format(self.constants) + + # genes + genes_str = "program = [0] * {}\n".format(len(self.genes)) + for gene_index, gene in enumerate(self.genes): + genes_str += " program[{}] = ".format(gene_index) + if type(gene) == VariableGene: + if gene.is_feature: + genes_str += "float(sys.argv[{}])".format(gene.index + 1) + else: + genes_str += "constants[{}]".format(gene.index) + elif type(gene) == OperatorGene: + if gene.operation == Chromosome.operator_lambdas[0]: + op = "+" + elif gene.operation == Chromosome.operator_lambdas[1]: + op = "-" + elif gene.operation == Chromosome.operator_lambdas[2]: + op = "*" + genes_str += "program[{}] {} program[{}]".format(gene.address1, op, gene.address2) + genes_str += "\n" + + # print statement + python_program = python_program.format(constants_str, genes_str, "print(program[{}])".format(len(self.genes)-1)) + + # return it + return python_program + def __repr__(self): return self.__str__() From 6dff56eda7d5c65f1c1d9bdf05604346f27be031 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Mon, 1 May 2017 21:00:43 -0400 Subject: [PATCH 16/51] Print the python program into a file. --- README.md | 2 +- mep/main.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 192029c..24ee508 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,4 @@ conda env create -f environment.yml source activate py-mep-dev ``` -Example, running with a dataset `python -m mep.main datasets/data1.csv` \ No newline at end of file +Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. 
This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`. \ No newline at end of file diff --git a/mep/main.py b/mep/main.py index 985bdd1..74f8767 100644 --- a/mep/main.py +++ b/mep/main.py @@ -11,6 +11,7 @@ # get the data file data_set_name = sys.argv[1] + python_file_name = sys.argv[2] data_set = DataSet(data_set_name) # read config file @@ -53,11 +54,20 @@ logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, best_chromosome.pretty_string())) + # TODO: this should probably be optional # prune out the unused genes best_chromosome.prune() logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, best_chromosome.pretty_string())) + # TODO: Optional? + # we then convert the chromosome into a valid python program and write it out to file + with open(python_file_name, 'w') as python_file: + python_program = best_chromosome.to_python() + logger.debug("Write out the python program to {}".format(python_file_name)) + logger.debug(python_program) + python_file.write(python_program) + # TODO: Convert the output to a valid python program # TODO: Add support for classification # TODO: Add example digital circuit test From c87cd6143d7eb2a6d78d791ac956a1bcfd1e903f Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Mon, 28 Aug 2017 20:01:43 -0400 Subject: [PATCH 17/51] Fix the README's filename and fix the broken test. --- README.md | 2 +- tests/mep/genetics/test_population.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 24ee508..ad723ed 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Based upon the C++ code [here](https://github.com/mepx/mep-basic-src). 
Create the conda environment and source it (Linux): ``` -conda env create -f environment.yml +conda env create -f environment.yaml source activate py-mep-dev ``` diff --git a/tests/mep/genetics/test_population.py b/tests/mep/genetics/test_population.py index 29b235f..bce9cd5 100644 --- a/tests/mep/genetics/test_population.py +++ b/tests/mep/genetics/test_population.py @@ -19,7 +19,7 @@ def test_random_tournament_selection(self): # construct the population num_examples = 5 num_features = 7 - population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1) + population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1) # confirm the number of feature variables (not critical for this test) self.assertEqual(num_features, population.num_feature_variables) @@ -41,7 +41,7 @@ def test_larger_random_tournament_selection(self): # construct the population num_examples = 5 num_features = 7 - population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1, 1) + population = Population(np.zeros((num_examples, num_features)), [], 1, 1, 1, 1, 1, 1, 1) # test the tournament selection; not that it randomly chooses the not as good chromosome min_chromosome, max_chromosome = Chromosome([], []), Chromosome([], []) From ba35e89259b40dce3c3dd8cc6d9675ea8c1decd5 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Mon, 28 Aug 2017 20:08:50 -0400 Subject: [PATCH 18/51] Bug fix, stop doing an insert, the population size should be fixed. 
--- mep/genetics/population.py | 4 ++-- mep/main.py | 5 +++-- tests/mep/genetics/test_population.py | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/mep/genetics/population.py b/mep/genetics/population.py index 2aea971..dd60592 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -164,7 +164,7 @@ def next_generation(self): break # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index if insert_index > -1: - self.chromosomes.insert(insert_index, offspring1) + self.chromosomes[insert_index] = offspring1 # now the other offspring insert_index = -1 @@ -174,5 +174,5 @@ def next_generation(self): break # insert it (if it wasn't worse than all existing chromosomes) at the appropriate index if insert_index > -1: - self.chromosomes.insert(insert_index, offspring2) + self.chromosomes[insert_index] = offspring2 diff --git a/mep/main.py b/mep/main.py index 74f8767..e3528ab 100644 --- a/mep/main.py +++ b/mep/main.py @@ -44,8 +44,9 @@ best_chromosome = None for generation in range(int(config["num_generations"])): best_chromosome = population.chromosomes[0] - logger.debug("Generation number {} best chromosome error {}".format(generation, - best_chromosome.error)) + logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format( + generation, best_chromosome.error, len(population.chromosomes))) + if best_chromosome.error == 0: logger.debug("Exiting early as we have hit the best possible error.") break diff --git a/tests/mep/genetics/test_population.py b/tests/mep/genetics/test_population.py index bce9cd5..78892b6 100644 --- a/tests/mep/genetics/test_population.py +++ b/tests/mep/genetics/test_population.py @@ -4,6 +4,7 @@ from mep.genetics.population import Population from mep.genetics.chromosome import Chromosome + class TestPopulation(unittest.TestCase): """ Test the Population class. 
From 5f2fbc587d1e505ef3205d09cf1efa6f5a7d858b Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 30 Dec 2017 16:07:26 -0500 Subject: [PATCH 19/51] This puts in an example evolved program in the README --- README.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ad723ed..64312ca 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,23 @@ conda env create -f environment.yaml source activate py-mep-dev ``` -Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`. \ No newline at end of file +Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`. + +An example Python program evolved to solve the addition problem of adding together two features (ex: datasets/data1.csv): +``` +import sys + +if __name__ == "__main__": + # constants + constants = [0.45084442258242485, -0.464331279636617, -0.5128830066318446] + + # now the genes + program = [0] * 3 + program[0] = float(sys.argv[2]) + program[1] = float(sys.argv[1]) + program[2] = program[0] + program[1] + + + # print out the final answer + print(program[2]) +``` \ No newline at end of file From ed408fd3318466bf367beefe7f331ebd0953f84f Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 30 Dec 2017 16:18:48 -0500 Subject: [PATCH 20/51] Add another function example. 
--- datasets/data3.csv | 20 ++++++++++++++++++++ datasets/files.txt | 1 + 2 files changed, 21 insertions(+) create mode 100644 datasets/data3.csv diff --git a/datasets/data3.csv b/datasets/data3.csv new file mode 100644 index 0000000..ae9e86e --- /dev/null +++ b/datasets/data3.csv @@ -0,0 +1,20 @@ +x1,x2,x3,target +1,2,3,0 +3,2,1,10 +2,3,1,6 +0,0,0,0 +5,4,10,19 +7,6,4,51 +-1,2,5,-2 +2,-1,6,-3 +3,4,-10,23 +5,6,9,22 +-3,-6,-6,9 +-7,4,0,53 +0,2,5,-3 +6,0,-1,37 +-8,5,0,69 +-3,-3,-3,9 +2,1,2,3 +-6,-3,2,31 +0,9,4,5 diff --git a/datasets/files.txt b/datasets/files.txt index d8aa9d8..cbb59c7 100644 --- a/datasets/files.txt +++ b/datasets/files.txt @@ -1,2 +1,3 @@ data1.csv is f(x_1, x_2) = x_1 + x_2 data2.csv is f(x_1, x_2) = x_1 * x_2 + x_3 +data3.csv is f(x_1, x_2, x_3) = x_1 * x_1 + x_2 - x_3 From 5b38a5f6e2580465c0ab1f11847e8388302869c9 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 30 Dec 2017 17:55:44 -0500 Subject: [PATCH 21/51] Instead of just lambdas let's use callable objects. --- mep/genetics/chromosome.py | 37 ++++++------- mep/genetics/gene.py | 3 +- mep/genetics/operator.py | 103 +++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 22 deletions(-) create mode 100644 mep/genetics/operator.py diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index f0e8c5c..c7d5738 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -2,6 +2,7 @@ import numpy as np from collections import deque from mep.genetics.gene import Gene, VariableGene, OperatorGene +from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator from random import random, randint, choice @@ -13,9 +14,9 @@ class Chromosome(object): """ # valid operators - operator_lambdas = [lambda a, b: a + b, # + - lambda a, b: a - b, # - - lambda a, b: a * b] # * + operators_family = [AdditionOperator, + MultiplicationOperator, + SubtractionOperator] def __init__(self, genes, constants): """ @@ -80,7 +81,7 @@ def 
generate_random_chromosome(cls, num_constants, constants_min, constants_max, prob = random() if prob <= operators_prob: # randomly choose valid addresses; randomly choose an operator - genes.append(OperatorGene(choice(Chromosome.operator_lambdas), + genes.append(OperatorGene(choice(Chromosome.operators_family)(), randint(0, gene_index - 1), randint(0, gene_index - 1))) elif prob <= operators_prob + feature_variable_prob: genes.append(VariableGene(randint(0, num_feature_variables - 1), is_feature=True)) @@ -176,15 +177,8 @@ def pretty_string(self, stop_at_best=True): if type(gene) == VariableGene: gene_str = gene.pretty_string() elif type(gene) == OperatorGene: - # TODO: Push this logic into the gene; the only tricky part is the operator lambda; we will probably - # need to replace the lambda with a larger object - if gene.operation == Chromosome.operator_lambdas[0]: - op = "+" - elif gene.operation == Chromosome.operator_lambdas[1]: - op = "-" - elif gene.operation == Chromosome.operator_lambdas[2]: - op = "*" - gene_str = "PROGRAM[{}] {} PROGRAM[{}]".format(gene.address1, op, gene.address2) + gene_str = "{}(PROGRAM[{}], PROGRAM[{}])".format(gene.operation.function_name(), + gene.address1, gene.address2) program += "{}:{}\n".format(gene_index, gene_str) if self.best_gene_index == gene_index and stop_at_best: @@ -251,6 +245,9 @@ def to_python(self): python_program = """ import sys +# define operator/functions +{} + if __name__ == "__main__": # constants {} @@ -261,6 +258,8 @@ def to_python(self): # print out the final answer {} """ + # define all the function/operators + operator_def_str = "\n".join([operator().function_python_definition() for operator in self.operators_family]) # constants constants_str = "constants = {}".format(self.constants) @@ -275,17 +274,13 @@ def to_python(self): else: genes_str += "constants[{}]".format(gene.index) elif type(gene) == OperatorGene: - if gene.operation == Chromosome.operator_lambdas[0]: - op = "+" - elif gene.operation == 
Chromosome.operator_lambdas[1]: - op = "-" - elif gene.operation == Chromosome.operator_lambdas[2]: - op = "*" - genes_str += "program[{}] {} program[{}]".format(gene.address1, op, gene.address2) + genes_str += "{}(program[{}], program[{}])".format(gene.operation.function_name(), + gene.address1, gene.address2) genes_str += "\n" # print statement - python_program = python_program.format(constants_str, genes_str, "print(program[{}])".format(len(self.genes)-1)) + python_program = python_program.format(operator_def_str, constants_str, genes_str, + "print(program[{}])".format(len(self.genes)-1)) # return it return python_program diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index b1adc0a..a83c00b 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -31,7 +31,8 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): """ -# TODO: Should we also add a mutate method to the gene itself? +# NOTE: Should we also add a mutate method to the gene itself? Considering that we are doing the mutation by doing +# a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit. class VariableGene(object): diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py new file mode 100644 index 0000000..4a0060d --- /dev/null +++ b/mep/genetics/operator.py @@ -0,0 +1,103 @@ +from abc import ABCMeta, abstractmethod + + +class Operator(object): + """ + This is more of a function than a traditional "operator" but the function could be simply using an operator + like "+", "-", etc. At it's core these are indivisible functions that take arguments (i.e. preceding genes) and + output some value. + """ + __metaclass__ = ABCMeta + + @abstractmethod + def __call__(self, *args, **kwargs): + """ + Run the operation/function and return the result. + """ + + @abstractmethod + def function_name(self): + """ + Return the name of the function for use in the pretty print and the python program. 
+ """ + + @abstractmethod + def function_python_definition(self): + """ + Return the python definition of the function + """ + + +# TODO: Consolidate these into just one Operator? +class AdditionOperator(Operator): + """ + Perform addition. + """ + # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes + + def __call__(self, *args, **kwargs): + """ + Perform addition. + """ + return sum(args) + + def function_name(self): + return "add" + + def function_python_definition(self): + return """ +def add(x, y): + return x + y + """ + + +class MultiplicationOperator(Operator): + """ + Perform multiplication + """ + # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes + + def __call__(self, *args, **kwargs): + """ + Perform multiplication. + """ + result = 1 + for arg in args: + result *= arg + + return result + + def function_name(self): + return "multiplication" + + def function_python_definition(self): + return """ +def multiplication(x, y): + return x * y + """ + + +class SubtractionOperator(Operator): + """ + Perform subtraction. + """ + # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes + + def __call__(self, *args, **kwargs): + """ + Perform subtraction. + """ + result = args[0] + for arg in args[1:]: + result -= arg + + return result + + def function_name(self): + return "subtraction" + + def function_python_definition(self): + return """ +def subtraction(x, y): + return x - y + """ \ No newline at end of file From e38fffc53ca68ebfda8ae673529307e4907dc12d Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 13 Jan 2018 15:04:09 -0500 Subject: [PATCH 22/51] Fix the test_chromosome and add initial operator test. 
--- mep/genetics/gene.py | 4 +++- mep/genetics/operator.py | 1 + tests/mep/genetics/test_chromosome.py | 2 +- tests/mep/genetics/test_operator.py | 15 +++++++++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 tests/mep/genetics/test_operator.py diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index a83c00b..fa9d12d 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -186,4 +186,6 @@ def __repr__(self): def __eq__(self, other): if other is None or not isinstance(other, OperatorGene): return False - return self.operation == other.operation and self.address1 == other.address1 and self.address2 == other.address2 + + # NOTE: the operators are the same if they are of the same type + return isinstance(self.operation, type(other.operation)) and self.address1 == other.address1 and self.address2 == other.address2 diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py index 4a0060d..c0f5b41 100644 --- a/mep/genetics/operator.py +++ b/mep/genetics/operator.py @@ -1,6 +1,7 @@ from abc import ABCMeta, abstractmethod +# TODO: add some more interesting operators; example pow(...), log(...), exp(...), min(...), max(...) 
class Operator(object): """ This is more of a function than a traditional "operator" but the function could be simply using an operator diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py index f498274..15081a1 100644 --- a/tests/mep/genetics/test_chromosome.py +++ b/tests/mep/genetics/test_chromosome.py @@ -50,7 +50,7 @@ def test_basic_random_construction(self): self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0]) # the 2nd gene can be a variable or an operator; in this case it is the below - self.assertEquals(OperatorGene(Chromosome.operator_lambdas[1], 0, 0), chromosome.genes[1]) + self.assertEquals(OperatorGene(Chromosome.operators_family[1](), 0, 0), chromosome.genes[1]) # verify constant self.assertAlmostEquals(8.599796663725433, chromosome.constants[0]) diff --git a/tests/mep/genetics/test_operator.py b/tests/mep/genetics/test_operator.py new file mode 100644 index 0000000..04133a5 --- /dev/null +++ b/tests/mep/genetics/test_operator.py @@ -0,0 +1,15 @@ +import unittest +from mep.genetics.operator import MultiplicationOperator + + +class TestOperators(unittest.TestCase): + """ + Test the Operator classes + """ + + def test_multiplication_operator(self): + """ + """ + # construct the oeprator + operator = MultiplicationOperator() + self.assertEquals(5 * 2, operator(5, 2)) \ No newline at end of file From 27023d02c187b35d541652d77092e361eb1f2701 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 13 Jan 2018 15:08:17 -0500 Subject: [PATCH 23/51] Test the other operators. 
--- tests/mep/genetics/test_operator.py | 35 ++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tests/mep/genetics/test_operator.py b/tests/mep/genetics/test_operator.py index 04133a5..bb5f630 100644 --- a/tests/mep/genetics/test_operator.py +++ b/tests/mep/genetics/test_operator.py @@ -1,5 +1,5 @@ import unittest -from mep.genetics.operator import MultiplicationOperator +from mep.genetics.operator import MultiplicationOperator, AdditionOperator, SubtractionOperator class TestOperators(unittest.TestCase): @@ -10,6 +10,35 @@ class TestOperators(unittest.TestCase): def test_multiplication_operator(self): """ """ - # construct the oeprator + # construct the operator and test it operator = MultiplicationOperator() - self.assertEquals(5 * 2, operator(5, 2)) \ No newline at end of file + self.assertEquals(5 * 2, operator(5, 2)) + self.assertEquals("multiplication", operator.function_name()) + self.assertEquals(""" +def multiplication(x, y): + return x * y + """, operator.function_python_definition()) + + def test_addition_operator(self): + """ + """ + # construct the operator and test it + operator = AdditionOperator() + self.assertEquals(5 + 2, operator(5, 2)) + self.assertEquals("add", operator.function_name()) + self.assertEquals(""" +def add(x, y): + return x + y + """, operator.function_python_definition()) + + def test_subtraction_operator(self): + """ + """ + # construct the operator and test it + operator = SubtractionOperator() + self.assertEquals(5 - 2, operator(5, 2)) + self.assertEquals("subtraction", operator.function_name()) + self.assertEquals(""" +def subtraction(x, y): + return x - y + """, operator.function_python_definition()) \ No newline at end of file From b5116a1b140ba9ffae1806c4b2e34fe49f10a1ec Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Sat, 13 Jan 2018 15:17:31 -0500 Subject: [PATCH 24/51] This adds the min and max operators. 
--- mep/genetics/chromosome.py | 5 ++- mep/genetics/operator.py | 44 ++++++++++++++++++++++++++- tests/mep/genetics/test_chromosome.py | 2 +- tests/mep/genetics/test_operator.py | 25 +++++++++++++++ 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index c7d5738..e93230b 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -3,6 +3,7 @@ from collections import deque from mep.genetics.gene import Gene, VariableGene, OperatorGene from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator +from mep.genetics.operator import MinOperator, MaxOperator from random import random, randint, choice @@ -16,7 +17,9 @@ class Chromosome(object): # valid operators operators_family = [AdditionOperator, MultiplicationOperator, - SubtractionOperator] + SubtractionOperator, + MinOperator, + MaxOperator] def __init__(self, genes, constants): """ diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py index c0f5b41..4c4e0fd 100644 --- a/mep/genetics/operator.py +++ b/mep/genetics/operator.py @@ -1,7 +1,7 @@ from abc import ABCMeta, abstractmethod -# TODO: add some more interesting operators; example pow(...), log(...), exp(...), min(...), max(...) +# TODO: add some more interesting operators; example pow(...), log(...), exp(...) class Operator(object): """ This is more of a function than a traditional "operator" but the function could be simply using an operator @@ -101,4 +101,46 @@ def function_python_definition(self): return """ def subtraction(x, y): return x - y + """ + + +class MinOperator(Operator): + """ + Perform the Min operation. + """ + + def __call__(self, *args, **kwargs): + """ + Perform min + """ + return min(args) + + def function_name(self): + return "min_" + + def function_python_definition(self): + return """ +def min_(x, y): + return min(x, y) + """ + + +class MaxOperator(Operator): + """ + Perform the Max operation. 
+ """ + + def __call__(self, *args, **kwargs): + """ + Perform max + """ + return max(args) + + def function_name(self): + return "max_" + + def function_python_definition(self): + return """ +def max_(x, y): + return max(x, y) """ \ No newline at end of file diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py index 15081a1..ac458f7 100644 --- a/tests/mep/genetics/test_chromosome.py +++ b/tests/mep/genetics/test_chromosome.py @@ -50,7 +50,7 @@ def test_basic_random_construction(self): self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0]) # the 2nd gene can be a variable or an operator; in this case it is the below - self.assertEquals(OperatorGene(Chromosome.operators_family[1](), 0, 0), chromosome.genes[1]) + self.assertEquals(OperatorGene(Chromosome.operators_family[2](), 0, 0), chromosome.genes[1]) # verify constant self.assertAlmostEquals(8.599796663725433, chromosome.constants[0]) diff --git a/tests/mep/genetics/test_operator.py b/tests/mep/genetics/test_operator.py index bb5f630..925a715 100644 --- a/tests/mep/genetics/test_operator.py +++ b/tests/mep/genetics/test_operator.py @@ -1,5 +1,6 @@ import unittest from mep.genetics.operator import MultiplicationOperator, AdditionOperator, SubtractionOperator +from mep.genetics.operator import MinOperator, MaxOperator class TestOperators(unittest.TestCase): @@ -41,4 +42,28 @@ def test_subtraction_operator(self): self.assertEquals(""" def subtraction(x, y): return x - y + """, operator.function_python_definition()) + + def test_min_operator(self): + """ + """ + # construct the operator and test it + operator = MinOperator() + self.assertEquals(min(5, 2), operator(5, 2)) + self.assertEquals("min_", operator.function_name()) + self.assertEquals(""" +def min_(x, y): + return min(x, y) + """, operator.function_python_definition()) + + def test_max_operator(self): + """ + """ + # construct the operator and test it + operator = MaxOperator() + 
self.assertEquals(max(5, 2), operator(5, 2)) + self.assertEquals("max_", operator.function_name()) + self.assertEquals(""" +def max_(x, y): + return max(x, y) """, operator.function_python_definition()) \ No newline at end of file From c9e2e09a0a4df37910a8dc65d0b631bccb0196e1 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Wed, 4 Jul 2018 14:13:41 -0400 Subject: [PATCH 25/51] Error check usage. --- mep/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mep/main.py b/mep/main.py index e3528ab..6e7e089 100644 --- a/mep/main.py +++ b/mep/main.py @@ -7,7 +7,13 @@ from mep.genetics.population import Population if __name__ == "__main__": - # TODO: error check usage + if len(sys.argv) != 3: + print("ERROR: Expected usage 'python -m mep.main DATA_SET_NAME PYTHON_FILE_NAME'\n" + + " DATA_SET_NAME: The name (full path) to the data file to train on.\n" + " PYTHON_FILE_NAME: The name (full path) to the python file to write the output program.\n" + "Example: 'python -m mep.main datasets/data1.csv test.py'" + ) + sys.exit(-1) # get the data file data_set_name = sys.argv[1] @@ -69,7 +75,6 @@ logger.debug(python_program) python_file.write(python_program) - # TODO: Convert the output to a valid python program # TODO: Add support for classification # TODO: Add example digital circuit test # TODO: Add UDFs From 1a1adfd499f39938571883e487567f1d6f357885 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Wed, 4 Jul 2018 14:27:51 -0400 Subject: [PATCH 26/51] Example data using the max(..) function. 
--- datasets/data4.csv | 51 ++++++++++++++++++++++++++++++++++++++++++++++ datasets/files.txt | 1 + 2 files changed, 52 insertions(+) create mode 100644 datasets/data4.csv diff --git a/datasets/data4.csv b/datasets/data4.csv new file mode 100644 index 0000000..7b48557 --- /dev/null +++ b/datasets/data4.csv @@ -0,0 +1,51 @@ +x1,x2,x3,target +93.0,16.0,-85.0,-7905.0 +-49.0,-5.0,16.0,-80.0 +-41.0,-21.0,79.0,-1659.0 +27.0,81.0,-63.0,-5103.0 +-9.0,-64.0,84.0,-756.0 +81.0,21.0,5.0,405.0 +-20.0,60.0,-75.0,-4500.0 +93.0,-35.0,53.0,4929.0 +0.0,-29.0,80.0,0.0 +-19.0,4.0,-33.0,-132.0 +-88.0,15.0,91.0,1365.0 +-57.0,-100.0,28.0,-1596.0 +91.0,-45.0,-36.0,-3276.0 +-50.0,19.0,-87.0,-1653.0 +-45.0,100.0,48.0,4800.0 +-32.0,22.0,-21.0,-462.0 +59.0,49.0,-46.0,-2714.0 +-40.0,-92.0,5.0,-200.0 +32.0,59.0,-85.0,-5015.0 +-94.0,-29.0,-8.0,232.0 +-21.0,-68.0,-26.0,546.0 +-25.0,26.0,81.0,2106.0 +12.0,25.0,-85.0,-2125.0 +-11.0,40.0,-57.0,-2280.0 +51.0,73.0,7.0,511.0 +100.0,-77.0,-43.0,-4300.0 +-74.0,-35.0,21.0,-735.0 +-34.0,-90.0,-14.0,476.0 +-84.0,-2.0,8.0,-16.0 +-41.0,92.0,-7.0,-644.0 +57.0,-85.0,65.0,3705.0 +38.0,55.0,71.0,3905.0 +-20.0,17.0,-100.0,-1700.0 +-42.0,40.0,-51.0,-2040.0 +-98.0,-52.0,-84.0,4368.0 +80.0,42.0,-4.0,-320.0 +63.0,-32.0,-46.0,-2898.0 +-3.0,-76.0,-19.0,57.0 +-37.0,20.0,-76.0,-1520.0 +-92.0,-24.0,-83.0,1992.0 +-23.0,-64.0,-89.0,2047.0 +94.0,-84.0,65.0,6110.0 +-54.0,3.0,-76.0,-228.0 +59.0,-61.0,49.0,2891.0 +33.0,-51.0,-32.0,-1056.0 +30.0,-22.0,46.0,1380.0 +2.0,36.0,0.0,0.0 +4.0,87.0,-88.0,-7656.0 +61.0,-13.0,-32.0,-1952.0 +46.0,13.0,-76.0,-3496.0 diff --git a/datasets/files.txt b/datasets/files.txt index cbb59c7..482a2ae 100644 --- a/datasets/files.txt +++ b/datasets/files.txt @@ -1,3 +1,4 @@ data1.csv is f(x_1, x_2) = x_1 + x_2 data2.csv is f(x_1, x_2) = x_1 * x_2 + x_3 data3.csv is f(x_1, x_2, x_3) = x_1 * x_1 + x_2 - x_3 +data4.csv is f(x_1, x_2, x_3) = max(x_1, x_2) * x_3 \ No newline at end of file From 1ef480460b712608de13b20053d32b713b309731 Mon Sep 17 00:00:00 
2001 From: Paul Jacobs Date: Wed, 4 Jul 2018 15:35:49 -0400 Subject: [PATCH 27/51] Refactoring to fit into a more scikit-learn approach. --- mep/main.py | 39 +++++----------------- mep/model.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 30 deletions(-) create mode 100644 mep/model.py diff --git a/mep/main.py b/mep/main.py index 6e7e089..d91913f 100644 --- a/mep/main.py +++ b/mep/main.py @@ -4,7 +4,7 @@ import logging import os from dataset import DataSet -from mep.genetics.population import Population +from mep.model import MEPModel if __name__ == "__main__": if len(sys.argv) != 3: @@ -38,39 +38,18 @@ logger = logging.getLogger("main") logger.info("Starting up...") - # construct a population and run it for the number of generations specified - population = Population(data_set.data_matrix, data_set.target, int(config["num_constants"]), - float(config["constants_min"]), float(config["constants_max"]), - float(config["feature_variables_probability"]), - int(config["code_length"]), int(config["population_size"]), - float(config["operators_probability"])) - population.initialize() - - # iterate through the generations - best_chromosome = None - for generation in range(int(config["num_generations"])): - best_chromosome = population.chromosomes[0] - logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format( - generation, best_chromosome.error, len(population.chromosomes))) - - if best_chromosome.error == 0: - logger.debug("Exiting early as we have hit the best possible error.") - break - population.next_generation() - - logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, - best_chromosome.pretty_string())) - - # TODO: this should probably be optional - # prune out the unused genes - best_chromosome.prune() - logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(best_chromosome.error, - best_chromosome.pretty_string())) + # 
configure the model; then fit it to the training data + model = MEPModel(int(config["num_constants"]), float(config["constants_min"]), float(config["constants_max"]), + float(config["feature_variables_probability"]), int(config["code_length"]), + int(config["population_size"]), float(config["operators_probability"]), + int(config["num_generations"])) + model.fit(data_set.data_matrix, data_set.target) + logger.info("Finished fitting the model") # TODO: Optional? # we then convert the chromosome into a valid python program and write it out to file with open(python_file_name, 'w') as python_file: - python_program = best_chromosome.to_python() + python_program = model.to_python() logger.debug("Write out the python program to {}".format(python_file_name)) logger.debug(python_program) python_file.write(python_program) diff --git a/mep/model.py b/mep/model.py new file mode 100644 index 0000000..973032b --- /dev/null +++ b/mep/model.py @@ -0,0 +1,93 @@ +import logging +from mep.genetics.population import Population + + +# NOTE: The idea is to explicitly conform to a scikit-learn type of approach where we can run fit(..) and +# predict(..) methods on the model +class MEPModel(object): + """ + Encapsulate the MEP model. + """ + + def __init__(self, num_constants, constants_min, constants_max, feature_variables_probability, code_length, + population_size, operators_probability, num_generations): + + """ + Initialize. 
+ :param num_constants: + :param constants_min: + :param constants_max: + :param feature_variables_probability: + :param code_length: + :param population_size: + :param operators_probability: + :param num_generations: + """ + # logger + self.logger = logging.getLogger(self.__class__.__name__) + + # core parameters + self.num_constants = num_constants + self.constants_min = constants_min + self.constants_max = constants_max + self.feature_variables_probability = feature_variables_probability + self.code_length = code_length + self.population_size = population_size + self.operators_probability = operators_probability + self.num_generations = num_generations + + # the best found chromosome from the evolution process + self.best_chromosome = None + + def fit(self, X, y): + """ + + :param X: + :param y: + :return: + """ + # construct a population and run it for the number of generations specified + population = Population(X, y, self.num_constants, + self.constants_min, self.constants_max, + self.feature_variables_probability, + self.code_length, self.population_size, + self.operators_probability) + population.initialize() + + # iterate through the generations + for generation in range(self.num_generations): + self.best_chromosome = population.chromosomes[0] + self.logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format( + generation, self.best_chromosome.error, len(population.chromosomes))) + + if self.best_chromosome.error == 0: + self.logger.debug("Exiting early as we have hit the best possible error.") + break + population.next_generation() + + self.logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format( + self.best_chromosome.error, self.best_chromosome.pretty_string())) + + # prune out the unused genes + self.best_chromosome.prune() + + def predict(self, X): + """ + + :param X: + :return: + """ + # TODO: Fill in logic + pass + + # NOTE: These are NOT scikit-learn methods now + def to_python(self): + """ + Return 
a python program which can run the model directly via direct inputs. + :return: the python program (string) + :rtype: str + """ + if self.best_chromosome is None: + raise ValueError("The model hasn't been fit.") + + return self.best_chromosome.to_python() From 61a320342bcfbb983d614b6a6cc8b21a47e681c6 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Fri, 6 Jul 2018 17:43:25 -0400 Subject: [PATCH 28/51] Adds the predict logic. --- mep/genetics/chromosome.py | 23 +++++++++++++++++++++++ mep/model.py | 20 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index e93230b..83e924f 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -113,6 +113,26 @@ def evaluate(self, data_matrix, targets): self.error = error self.best_gene_index = gene_index + def predict(self, data_matrix): + """ + Return the predictions for this data. + :param data_matrix: the sample data; matrix with (n_samples, n_features) + :type data_matrix: np.matrix + :return: the prediction for each sample; array-like (n_samples) length + :rtype: np.array + """ + # NOTE: This is almost identical to evaluate except that we are running after we have done the fit so we have + # already determined the best gene index and we just want to calculate the values; no error calc + num_examples = data_matrix.shape[0] + eval_matrix = np.zeros((len(self.genes), num_examples)) + dummy_targets = [0] * num_examples + for gene_index, gene in enumerate(self.genes): + # compute the error for this gene; if it is the best we have found then update + gene.evaluate(gene_index, eval_matrix, data_matrix, self.constants, dummy_targets) + if self.best_gene_index == gene_index: + # extract from the eval_matrix; these from this gene (line in program) for each of the examples + return eval_matrix[gene_index, :] + def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max, constants_prob, feature_variable_prob, 
num_feature_variables, num_genes, operators_prob): """ @@ -238,6 +258,9 @@ def prune(self): gene.address1 = gene_indices_in_use.index(gene.address1) gene.address2 = gene_indices_in_use.index(gene.address2) + # the now "best gene" is just the last one + self.best_gene_index = len(self.genes) - 1 + def to_python(self): """ Convert to python program string. diff --git a/mep/model.py b/mep/model.py index 973032b..7a3d529 100644 --- a/mep/model.py +++ b/mep/model.py @@ -73,11 +73,29 @@ def fit(self, X, y): def predict(self, X): """ + Return the predictions for this data. + :param X: the sample data; matrix with (n_samples, n_features) + :type X: np.matrix + :return: the prediction for each sample; array-like (n_samples) length + :rtype: np.array + """ + return self.best_chromosome.predict(X) + + def score(self, X): + """ + Returns the coefficient of determination R^2 of the prediction. + + The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares + ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). + The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). + A constant model that always predicts the expected value of y, disregarding the input features, would get a + R^2 score of 0.0. + (NOTE: Comment taken from scikit-learn.) :param X: :return: """ - # TODO: Fill in logic + # TODO: pass # NOTE: These are NOT scikit-learn methods now From a7804d4d86c85e41c60d730fa7ead177ece95bb2 Mon Sep 17 00:00:00 2001 From: Paul Jacobs Date: Fri, 6 Jul 2018 20:12:32 -0400 Subject: [PATCH 29/51] Populates the score method of the model. 
--- mep/model.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mep/model.py b/mep/model.py index 7a3d529..915c998 100644 --- a/mep/model.py +++ b/mep/model.py @@ -81,7 +81,7 @@ def predict(self, X): """ return self.best_chromosome.predict(X) - def score(self, X): + def score(self, X, y): """ Returns the coefficient of determination R^2 of the prediction. @@ -92,11 +92,18 @@ def score(self, X): R^2 score of 0.0. (NOTE: Comment taken from scikit-learn.) - :param X: - :return: + :param X: the sample data; matrix with (n_samples, n_features) + :type X: np.matrix + :param y: the target values + :type y: array-like, shape = (n_samples) + :return: the score + :rtype: float """ - # TODO: - pass + y_pred = self.predict(X) + u = ((y - y_pred) ** 2).sum() + v = ((y - y.mean()) ** 2).sum() + + return 1 - u/v # NOTE: These are NOT scikit-learn methods now def to_python(self): From ecfee48046d626b8cabaf5d051fcde24ac8518a9 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Wed, 17 Oct 2018 17:50:38 +0000 Subject: [PATCH 30/51] a basic test of the model --- mep/model.py | 8 +++--- tests/mep/genetics/test_model.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 tests/mep/genetics/test_model.py diff --git a/mep/model.py b/mep/model.py index 915c998..a44fa30 100644 --- a/mep/model.py +++ b/mep/model.py @@ -41,10 +41,10 @@ def __init__(self, num_constants, constants_min, constants_max, feature_variable def fit(self, X, y): """ - - :param X: - :param y: - :return: + Fit the model. Given the feature vectors in matrix 'X' and the target vector 'y' we fit our model. 
+ :param X: the feature matrix (training data) + :param y: the target values + :return: nothing """ # construct a population and run it for the number of generations specified population = Population(X, y, self.num_constants, diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py new file mode 100644 index 0000000..fae9833 --- /dev/null +++ b/tests/mep/genetics/test_model.py @@ -0,0 +1,47 @@ +import unittest +from mep.model import MEPModel +import random +import numpy as np + +# make reproducible +random.seed(1) + + +class TestModel(unittest.TestCase): + """ + Test the model. + """ + + def test_model_basic(self): + model = MEPModel(num_constants=2, constants_min=-1, constants_max=1, + feature_variables_probability=0.4, code_length=50, + population_size=100, operators_probability=0.5, + num_generations=200) + + # generate data from this function + def function_to_learn(x1, x2): + return x1 + x2 + training_feature_matrix = [] + training_target_vector = [] + for sample in range(100): + x1, x2 = random.randint(-100, 100), random.randint(-100, 100) + val = function_to_learn(x1, x2) + training_feature_matrix.append([x1, x2]) + training_target_vector.append(val) + + # fit the model + model.fit(np.matrix(training_feature_matrix), np.array(training_target_vector)) + + # test data + def function_to_learn(x1, x2): + return x1 + x2 + test_feature_matrix = [] + test_target_vector = [] + for sample in range(100): + x1, x2 = random.randint(-100, 100), random.randint(-100, 100) + val = function_to_learn(x1, x2) + test_feature_matrix.append([x1, x2]) + test_target_vector.append(val) + + self.assertEquals(model.score(np.matrix(training_feature_matrix), np.array(training_target_vector)), 1) + From a4209a0d937a1a34ffd4632aac17258eff092417 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Wed, 17 Oct 2018 18:44:50 +0000 Subject: [PATCH 31/51] more complex functions to learn --- tests/mep/genetics/test_model.py | 84 ++++++++++++++++++++++++-------- 1 file changed, 
65 insertions(+), 19 deletions(-) diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py index fae9833..52fc8d2 100644 --- a/tests/mep/genetics/test_model.py +++ b/tests/mep/genetics/test_model.py @@ -2,16 +2,42 @@ from mep.model import MEPModel import random import numpy as np +import logging +import datetime as dt # make reproducible random.seed(1) +logging.basicConfig(filename="logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")), + level=logging.DEBUG, + filemode='w', + format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s") +logger = logging.getLogger("main") + class TestModel(unittest.TestCase): """ Test the model. """ + def _generate_train_and_test(self, function_to_learn, num_samples, num_args): + training_feature_matrix = [] + training_target_vector = [] + for sample in range(num_samples): + args = [random.randint(-250, 250) for _ in range(num_args)] + training_feature_matrix.append(args) + training_target_vector.append(function_to_learn(*args)) + + test_feature_matrix = [] + test_target_vector = [] + for sample in range(num_samples): + args = [random.randint(-250, 250) for _ in range(num_args)] + test_feature_matrix.append(args) + test_target_vector.append(function_to_learn(*args)) + + return np.matrix(training_feature_matrix), np.array(training_target_vector), \ + np.matrix(test_feature_matrix), np.array(test_target_vector) + def test_model_basic(self): model = MEPModel(num_constants=2, constants_min=-1, constants_max=1, feature_variables_probability=0.4, code_length=50, @@ -21,27 +47,47 @@ def test_model_basic(self): # generate data from this function def function_to_learn(x1, x2): return x1 + x2 - training_feature_matrix = [] - training_target_vector = [] - for sample in range(100): - x1, x2 = random.randint(-100, 100), random.randint(-100, 100) - val = function_to_learn(x1, x2) - training_feature_matrix.append([x1, x2]) - training_target_vector.append(val) + + training_feature_matrix, 
training_target_vector, test_feature_matrix, test_target_vector = self._generate_train_and_test( + function_to_learn, 100, 2) # fit the model - model.fit(np.matrix(training_feature_matrix), np.array(training_target_vector)) + model.fit(training_feature_matrix, training_target_vector) - # test data - def function_to_learn(x1, x2): - return x1 + x2 - test_feature_matrix = [] - test_target_vector = [] - for sample in range(100): - x1, x2 = random.randint(-100, 100), random.randint(-100, 100) - val = function_to_learn(x1, x2) - test_feature_matrix.append([x1, x2]) - test_target_vector.append(val) + self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1) + + def test_model_min_max(self): + model = MEPModel(num_constants=2, constants_min=-1, constants_max=1, + feature_variables_probability=0.4, code_length=50, + population_size=100, operators_probability=0.7, + num_generations=200) + + # generate data from this function + def function_to_learn(x1, x2, x3, x4): + return min(x1, x2) + max(x3, x4) + + training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector = self._generate_train_and_test( + function_to_learn, 100, 4) - self.assertEquals(model.score(np.matrix(training_feature_matrix), np.array(training_target_vector)), 1) + # fit the model + model.fit(training_feature_matrix, training_target_vector) + + self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1) + + def test_model_pow(self): + model = MEPModel(num_constants=2, constants_min=-1, constants_max=1, + feature_variables_probability=0.4, code_length=50, + population_size=100, operators_probability=0.5, + num_generations=200) + + # generate data from this function + def function_to_learn(x1, x2, x3, x4): + return x1 * x2 + x2 * x2 + x3 + + training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector = self._generate_train_and_test( + function_to_learn, 100, 4) + + # fit the model + model.fit(training_feature_matrix, 
training_target_vector) + self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1) From 98858b0d5e012779223b260dd737d5967876321d Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 18:50:04 +0000 Subject: [PATCH 32/51] requirements file --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..70f1d2f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy==1.17 +pandas==0.25 \ No newline at end of file From a029a0811f065878045a41392c744e332641c6a3 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 18:50:29 +0000 Subject: [PATCH 33/51] fix import --- mep/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mep/main.py b/mep/main.py index d91913f..5c04b1e 100644 --- a/mep/main.py +++ b/mep/main.py @@ -3,7 +3,7 @@ import json import logging import os -from dataset import DataSet +from mep.dataset import DataSet from mep.model import MEPModel if __name__ == "__main__": From a0dbf2e4ea473651996b45006b61e6080f69d22a Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 18:53:43 +0000 Subject: [PATCH 34/51] pip approach --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 64312ca..9486b32 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,13 @@ conda env create -f environment.yaml source activate py-mep-dev ``` +Or using `pip` we could do: + +``` +virtualenv -p python3 .venv +pip install -r requirements.txt +``` + Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`. 
An example Python program evolved to solve the addition problem of adding together two features (ex: datasets/data1.csv): From 81411cbed7662ad1c63913b242aeeb65d0229bbe Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:21:27 +0000 Subject: [PATCH 35/51] drop old fashioned (object) inheritance --- mep/dataset.py | 2 +- mep/genetics/chromosome.py | 2 +- mep/genetics/gene.py | 2 +- mep/genetics/operator.py | 2 +- mep/genetics/population.py | 2 +- mep/model.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mep/dataset.py b/mep/dataset.py index 4f09241..2962c0d 100644 --- a/mep/dataset.py +++ b/mep/dataset.py @@ -1,7 +1,7 @@ import pandas as pd -class DataSet(object): +class DataSet: """ Encapsulate a data set. Feature vectors and their targets. """ diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 83e924f..5e8e37d 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -7,7 +7,7 @@ from random import random, randint, choice -class Chromosome(object): +class Chromosome: """ Level above Gene. Each chromosome is a fixed number of genes and constants. We can think of a chromosome as a program where each gene is a line of code in the program. Genes can reference the result of other genes by their diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index fa9d12d..4672eed 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -118,7 +118,7 @@ def __eq__(self, other): return self.index == other.index and self.is_feature == other.is_feature -class OperatorGene(object): +class OperatorGene: """ This gene performance an operation on two addresses. The addresses are indices in the eval_matrix -- i.e. from the evaluation of other genes before this one. 
diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py index 4c4e0fd..16f38be 100644 --- a/mep/genetics/operator.py +++ b/mep/genetics/operator.py @@ -2,7 +2,7 @@ # TODO: add some more interesting operators; example pow(...), log(...), exp(...) -class Operator(object): +class Operator: """ This is more of a function than a traditional "operator" but the function could be simply using an operator like "+", "-", etc. At it's core these are indivisible functions that take arguments (i.e. preceding genes) and diff --git a/mep/genetics/population.py b/mep/genetics/population.py index dd60592..97c4d8d 100644 --- a/mep/genetics/population.py +++ b/mep/genetics/population.py @@ -3,7 +3,7 @@ import copy -class Population(object): +class Population: """ A collection of chromosomes. """ diff --git a/mep/model.py b/mep/model.py index a44fa30..6cbc3f3 100644 --- a/mep/model.py +++ b/mep/model.py @@ -4,7 +4,7 @@ # NOTE: The idea is to explicitly conform to a scikit-learn type of approach where we can run fit(..) and # predict(..) methods on the model -class MEPModel(object): +class MEPModel: """ Encapsulate the MEP model. """ From 9c5ebf8e3880cb3c8266eeef302f7a13f9b9a223 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:28:59 +0000 Subject: [PATCH 36/51] type hinting --- mep/model.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mep/model.py b/mep/model.py index 6cbc3f3..52fbf7b 100644 --- a/mep/model.py +++ b/mep/model.py @@ -1,4 +1,5 @@ import logging +import numpy as np from mep.genetics.population import Population @@ -9,8 +10,9 @@ class MEPModel: Encapsulate the MEP model. 
""" - def __init__(self, num_constants, constants_min, constants_max, feature_variables_probability, code_length, - population_size, operators_probability, num_generations): + def __init__(self, num_constants: int, constants_min: float, constants_max: float, + feature_variables_probability: float, code_length: int, population_size: int, + operators_probability: float, num_generations: int): """ Initialize. @@ -39,7 +41,7 @@ def __init__(self, num_constants, constants_min, constants_max, feature_variable # the best found chromosome from the evolution process self.best_chromosome = None - def fit(self, X, y): + def fit(self, X: np.ndarray, y: np.ndarray): """ Fit the model. Given the feature vectors in matrix 'X' and the target vector 'y' we fit our model. :param X: the feature matrix (training data) @@ -71,17 +73,15 @@ def fit(self, X, y): # prune out the unused genes self.best_chromosome.prune() - def predict(self, X): + def predict(self, X: np.ndarray) -> np.ndarray: """ Return the predictions for this data. :param X: the sample data; matrix with (n_samples, n_features) - :type X: np.matrix :return: the prediction for each sample; array-like (n_samples) length - :rtype: np.array """ return self.best_chromosome.predict(X) - def score(self, X, y): + def score(self, X: np.ndarray, y: np.ndarray) -> float: """ Returns the coefficient of determination R^2 of the prediction. @@ -93,11 +93,9 @@ def score(self, X, y): (NOTE: Comment taken from scikit-learn.) 
:param X: the sample data; matrix with (n_samples, n_features) - :type X: np.matrix :param y: the target values :type y: array-like, shape = (n_samples) :return: the score - :rtype: float """ y_pred = self.predict(X) u = ((y - y_pred) ** 2).sum() From 837302afb64a27df64b5c86891f231b93cc3b1a8 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:29:18 +0000 Subject: [PATCH 37/51] use a consistent output log dir --- tests/mep/genetics/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py index 52fc8d2..2859262 100644 --- a/tests/mep/genetics/test_model.py +++ b/tests/mep/genetics/test_model.py @@ -8,7 +8,7 @@ # make reproducible random.seed(1) -logging.basicConfig(filename="logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")), +logging.basicConfig(filename="output_logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")), level=logging.DEBUG, filemode='w', format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s") From 09a092e2e371237d9e3edbbe6584c33403f017a3 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:29:41 +0000 Subject: [PATCH 38/51] pytest is needed for the ./test.sh --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 70f1d2f..3fa8759 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy==1.17 -pandas==0.25 \ No newline at end of file +pandas==0.25 +pytest==5.2.1 \ No newline at end of file From 7ea7e5ec313ae659a82fef0705de8655f2ad453d Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:30:58 +0000 Subject: [PATCH 39/51] explicit reference to the test dir --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index be79cbe..cd7b35b 100755 --- a/test.sh +++ b/test.sh @@ -1,2 +1,2 @@ #!/bin/bash -py.test \ No newline at end of file +py.test tests/ From 
8802f43d553789651c1c8e5164991568a3e3bc6b Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:34:56 +0000 Subject: [PATCH 40/51] use python 3+ abstract classes and genes need to inherit Gene --- mep/genetics/gene.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index 4672eed..7e07d87 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -3,11 +3,10 @@ from abc import ABCMeta, abstractmethod -class Gene(object): +class Gene(metaclass=ABCMeta): """ Lowest level of the genetic structure of MEP. Think of this as one line of code in the program. """ - __metaclass__ = ABCMeta @abstractmethod def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): @@ -35,7 +34,7 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): # a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit. -class VariableGene(object): +class VariableGene(Gene): """ This gene is simply a variable. Either a constant or one of the features in the data -- i.e. an input variable. """ @@ -118,7 +117,7 @@ def __eq__(self, other): return self.index == other.index and self.is_feature == other.is_feature -class OperatorGene: +class OperatorGene(Gene): """ This gene performance an operation on two addresses. The addresses are indices in the eval_matrix -- i.e. from the evaluation of other genes before this one. 
From 348d31cbecefef656ec19035b7caa5f19064d989 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:43:25 +0000 Subject: [PATCH 41/51] use newer abstract class mechanism, better str rep, and type hinting --- mep/genetics/gene.py | 17 +++++++---------- mep/genetics/operator.py | 9 +++++++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index 7e07d87..0309e09 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -2,6 +2,8 @@ import numpy as np from abc import ABCMeta, abstractmethod +from mep.genetics.operator import Operator + class Gene(metaclass=ABCMeta): """ @@ -9,7 +11,7 @@ class Gene(metaclass=ABCMeta): """ @abstractmethod - def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float: """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. @@ -39,20 +41,18 @@ class VariableGene(Gene): This gene is simply a variable. Either a constant or one of the features in the data -- i.e. an input variable. """ - def __init__(self, index, is_feature=True): + def __init__(self, index: int, is_feature=True): """ The index into either the feature vector (if "is_feature" is True) or into the constants. :param index: the index into the vector - :type index: int :param is_feature: whether this is a feature variable or a constant - :type is_feature: bool """ # self.logger = logging.getLogger(self.__class__) self.index = index self.is_feature = is_feature - def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float: """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. 
@@ -124,15 +124,12 @@ class OperatorGene(Gene): """ # NOTE: This could be expanded to multiple addresses - def __init__(self, operation, address1, address2): + def __init__(self, operation: Operator, address1: int, address2: int): """ Initialize. :param operation: a lambda or function that can be operated on two floats - :type operation: lambda :param address1: index into the eval_matrix - :type address1: int :param address2: index into the eval_matrix - :type address2: int """ # self.logger = logging.getLogger(self.__class__) @@ -140,7 +137,7 @@ def __init__(self, operation, address1, address2): self.address1 = address1 self.address2 = address2 - def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float: """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py index 16f38be..5790c1a 100644 --- a/mep/genetics/operator.py +++ b/mep/genetics/operator.py @@ -2,13 +2,12 @@ # TODO: add some more interesting operators; example pow(...), log(...), exp(...) -class Operator: +class Operator(metaclass=ABCMeta): """ This is more of a function than a traditional "operator" but the function could be simply using an operator like "+", "-", etc. At it's core these are indivisible functions that take arguments (i.e. preceding genes) and output some value. """ - __metaclass__ = ABCMeta @abstractmethod def __call__(self, *args, **kwargs): @@ -28,6 +27,12 @@ def function_python_definition(self): Return the python definition of the function """ + def __str__(self): + return self.function_name() + + def __repr__(self): + return str(self) + # TODO: Consolidate these into just one Operator? 
class AdditionOperator(Operator): From 1f2c175d00729902a953513436879b08a0e3baa3 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:44:28 +0000 Subject: [PATCH 42/51] test must have broken with upgrade (change in random seed?) so this fixes it --- tests/mep/genetics/test_chromosome.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/mep/genetics/test_chromosome.py b/tests/mep/genetics/test_chromosome.py index ac458f7..ba093eb 100644 --- a/tests/mep/genetics/test_chromosome.py +++ b/tests/mep/genetics/test_chromosome.py @@ -43,17 +43,17 @@ def test_basic_random_construction(self): operators_prob=0.5) # confirm the number of genes and constants match what we expect - self.assertEquals(num_genes, len(chromosome.genes)) - self.assertEquals(num_constants, len(chromosome.constants)) + self.assertEqual(num_genes, len(chromosome.genes)) + self.assertEqual(num_constants, len(chromosome.constants)) # the first gene has to be a variable gene; in particular it is this one - self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0]) + self.assertEqual(VariableGene(0, is_feature=False), chromosome.genes[0]) # the 2nd gene can be a variable or an operator; in this case it is the below - self.assertEquals(OperatorGene(Chromosome.operators_family[2](), 0, 0), chromosome.genes[1]) + self.assertEqual(OperatorGene(Chromosome.operators_family[4](), 0, 0), chromosome.genes[1]) # verify constant - self.assertAlmostEquals(8.599796663725433, chromosome.constants[0]) + self.assertAlmostEqual(8.599796663725433, chromosome.constants[0]) def test_evaluate(self): """ From a453564cb9a7bf199c5439bd9bd0902227c065a0 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:46:27 +0000 Subject: [PATCH 43/51] drop deprecated assertEquals in favor of assertEqual --- tests/mep/genetics/test_gene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py index
7b62d12..26c4b8c 100644 --- a/tests/mep/genetics/test_gene.py +++ b/tests/mep/genetics/test_gene.py @@ -34,7 +34,7 @@ def test_basic_constant(self): # run the evaluate error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) - self.assertEquals((1. - 0) + (1. - 0), error) + self.assertEqual((1. - 0) + (1. - 0), error) def test_basic_feature_gene(self): """ @@ -66,7 +66,7 @@ def test_basic_feature_gene(self): # run the evaluate error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) - self.assertEquals((5. - 0.) + (7. - 0.), error) + self.assertEqual((5. - 0.) + (7. - 0.), error) def test_constant_and_feature_gene(self): """ From ffd2a01ce1e4d9f9443611bb87397bff8804cd9e Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 19:55:11 +0000 Subject: [PATCH 44/51] deprecated matrix in favor of array in newer numpy --- tests/mep/genetics/test_gene.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py index 26c4b8c..037ef00 100644 --- a/tests/mep/genetics/test_gene.py +++ b/tests/mep/genetics/test_gene.py @@ -29,7 +29,7 @@ def test_basic_constant(self): # expected; only one gene and it is going to be using the first constant; gene_index = 0 - expected_eval_matrix = np.matrix([[constants[constant_index], constants[constant_index]]]) + expected_eval_matrix = np.array([[constants[constant_index], constants[constant_index]]]) # run the evaluate error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) @@ -61,7 +61,7 @@ def test_basic_feature_gene(self): # expected; only one gene and it is going to be using the first constant; gene_index = 0 - expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]]]) + expected_eval_matrix =
np.array([[data_matrix[0, feature_index], data_matrix[1, feature_index]]]) # run the evaluate error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) @@ -94,7 +94,7 @@ def test_constant_and_feature_gene(self): data_matrix[1, feature_index] = 7. # expected; - expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]], + expected_eval_matrix = np.array([[data_matrix[0, feature_index], data_matrix[1, feature_index]], [constants[constant_index], constants[constant_index]]]) # run the evaluate @@ -127,8 +127,7 @@ def test_operator_gene_basic(self): eval_matrix[0, 0] = 2 # expected; first gene is unchanged; the 2nd one is the sum of the first with itself (i.e. 4) - expected_eval_matrix = np.matrix([[2], - [4]]) + expected_eval_matrix = np.array([[2], [4]]) # run the evaluate error = gene.evaluate(1, eval_matrix, data_matrix, constants, targets) From e0502794f454197b064466399ecdc8665c8761df Mon Sep 17 00:00:00 2001 From: pjacobs Date: Thu, 24 Oct 2019 20:00:34 +0000 Subject: [PATCH 45/51] assertEqual in test model --- mep/genetics/gene.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index 0309e09..aa6d3c3 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -1,4 +1,6 @@ import logging +from typing import Union, Callable + import numpy as np from abc import ABCMeta, abstractmethod @@ -124,7 +126,7 @@ class OperatorGene(Gene): """ # NOTE: This could be expanded to multiple addresses - def __init__(self, operation: Operator, address1: int, address2: int): + def __init__(self, operation: Union[Callable, Operator], address1: int, address2: int): """ Initialize. 
:param operation: a lambda or function that can be operated on two floats From b1f3244ecfe1001da3faac6433bd67402c91c9bb Mon Sep 17 00:00:00 2001 From: pjacobs Date: Fri, 25 Oct 2019 16:18:22 +0000 Subject: [PATCH 46/51] deprecated assertEquals to assertEqual --- tests/mep/genetics/test_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mep/genetics/test_model.py b/tests/mep/genetics/test_model.py index 2859262..f42d52c 100644 --- a/tests/mep/genetics/test_model.py +++ b/tests/mep/genetics/test_model.py @@ -54,7 +54,7 @@ def function_to_learn(x1, x2): # fit the model model.fit(training_feature_matrix, training_target_vector) - self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1) + self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1) def test_model_min_max(self): model = MEPModel(num_constants=2, constants_min=-1, constants_max=1, @@ -72,7 +72,7 @@ def function_to_learn(x1, x2, x3, x4): # fit the model model.fit(training_feature_matrix, training_target_vector) - self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1) + self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1) def test_model_pow(self): model = MEPModel(num_constants=2, constants_min=-1, constants_max=1, @@ -90,4 +90,4 @@ def function_to_learn(x1, x2, x3, x4): # fit the model model.fit(training_feature_matrix, training_target_vector) - self.assertEquals(model.score(test_feature_matrix, test_target_vector), 1) + self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1) From 9f6a424108345834c528cc44356a7f10abbdd3dd Mon Sep 17 00:00:00 2001 From: pjacobs Date: Fri, 25 Oct 2019 16:42:32 +0000 Subject: [PATCH 47/51] use arg parse --- mep/main.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mep/main.py b/mep/main.py index 5c04b1e..d3ab6f3 100644 --- a/mep/main.py +++ b/mep/main.py @@ -1,3 +1,4 @@ +import argparse import sys import datetime
as dt import json @@ -7,17 +8,15 @@ from mep.model import MEPModel if __name__ == "__main__": - if len(sys.argv) != 3: - print("ERROR: Expected usage 'python -m mep.main DATA_SET_NAME PYTHON_FILE_NAME'\n" + - " DATA_SET_NAME: The name (full path) to the data file to train on.\n" - " PYTHON_FILE_NAME: The name (full path) to the python file to write the output program.\n" - "Example: 'python -m mep.main datasets/data1.csv test.py'" - ) - sys.exit(-1) + parser = argparse.ArgumentParser( + description="Run the MEP model.\nExample: 'python -m mep.main datasets/data1.csv test.py", allow_abbrev=False) + parser.add_argument("data_set_name", help="The name (full path) to the data file to train on.") + parser.add_argument("python_file_name", help="The name (full path) to the python file to write the output program.") + args = parser.parse_args() # get the data file - data_set_name = sys.argv[1] - python_file_name = sys.argv[2] + data_set_name = args.data_set_name + python_file_name = args.python_file_name data_set = DataSet(data_set_name) # read config file @@ -46,13 +45,13 @@ model.fit(data_set.data_matrix, data_set.target) logger.info("Finished fitting the model") - # TODO: Optional? 
# we then convert the chromosome into a valid python program and write it out to file - with open(python_file_name, 'w') as python_file: + if python_file_name: python_program = model.to_python() - logger.debug("Write out the python program to {}".format(python_file_name)) - logger.debug(python_program) - python_file.write(python_program) + with open(python_file_name, 'w') as python_file: + logger.debug("Write out the python program to {}".format(python_file_name)) + logger.debug(python_program) + python_file.write(python_program) # TODO: Add support for classification # TODO: Add example digital circuit test From fe579aa65885f7a85b78859e004dbaf8a85ed57d Mon Sep 17 00:00:00 2001 From: pjacobs Date: Mon, 28 Oct 2019 21:12:37 +0000 Subject: [PATCH 48/51] print the pruned chromosome too --- mep/genetics/chromosome.py | 7 ++++--- mep/model.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 5e8e37d..a38a3e5 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -196,10 +196,11 @@ def pretty_string(self, stop_at_best=True): # now show each gene on a separate line for gene_index, gene in enumerate(self.genes): - gene_str = gene.__str__() - if type(gene) == VariableGene: + gene_str = str(gene) + if isinstance(gene, VariableGene): gene_str = gene.pretty_string() - elif type(gene) == OperatorGene: + elif isinstance(gene, OperatorGene): + # TODO: why not push to the gene? 
gene_str = "{}(PROGRAM[{}], PROGRAM[{}])".format(gene.operation.function_name(), gene.address1, gene.address2) program += "{}:{}\n".format(gene_index, gene_str) diff --git a/mep/model.py b/mep/model.py index 52fbf7b..60b34f6 100644 --- a/mep/model.py +++ b/mep/model.py @@ -73,6 +73,8 @@ def fit(self, X: np.ndarray, y: np.ndarray): # prune out the unused genes self.best_chromosome.prune() + self.logger.debug("Pruned chromosome (pretty)\n {}".format(self.best_chromosome.pretty_string())) + def predict(self, X: np.ndarray) -> np.ndarray: """ Return the predictions for this data. From 5dfdb83c206252f6cbe48e5a27a4cd1aeecc50eb Mon Sep 17 00:00:00 2001 From: pjacobs Date: Mon, 28 Oct 2019 21:15:58 +0000 Subject: [PATCH 49/51] use isinstance(..) --- mep/genetics/chromosome.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index a38a3e5..b08642d 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -216,6 +216,9 @@ def prune(self): Trim out the unused genes. NOTE: This "breaks" the chromosomes as it is going to change how many genes are in the program. Only do this once we have finished evolving the program. 
""" + + # TODO: drop genes which do nothing; ex: min(x[0], x[0]) or max(x[0], x[0]) + # the best gene index is going to be the last line of the program; since the genes never reference genes # beyond it then we just proceed back to the top and remove any which haven't been referenced; we determine # this via a BFS type search @@ -238,7 +241,7 @@ def prune(self): # check the addresses on the gene if it is an operator gene = self.genes[gene_index] - if type(gene) == OperatorGene: + if isinstance(gene, OperatorGene): genes_indices_to_visit.appendleft(gene.address1) genes_indices_to_visit.appendleft(gene.address2) gene_indices_in_use.add(gene.address1) @@ -255,7 +258,7 @@ def prune(self): # TODO: This could be done in the list comprehension but it is clearer to just do another pass # re-map the address to the new index for gene in self.genes: - if type(gene) == OperatorGene: + if isinstance(gene, OperatorGene): gene.address1 = gene_indices_in_use.index(gene.address1) gene.address2 = gene_indices_in_use.index(gene.address2) @@ -295,12 +298,12 @@ def to_python(self): genes_str = "program = [0] * {}\n".format(len(self.genes)) for gene_index, gene in enumerate(self.genes): genes_str += " program[{}] = ".format(gene_index) - if type(gene) == VariableGene: + if isinstance(gene, VariableGene): if gene.is_feature: genes_str += "float(sys.argv[{}])".format(gene.index + 1) else: genes_str += "constants[{}]".format(gene.index) - elif type(gene) == OperatorGene: + elif isinstance(gene, OperatorGene): genes_str += "{}(program[{}], program[{}])".format(gene.operation.function_name(), gene.address1, gene.address2) genes_str += "\n" From dbd710e0feb86c707da063682a487b9057249d98 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Mon, 28 Oct 2019 21:44:40 +0000 Subject: [PATCH 50/51] push pretty string into the gene --- mep/genetics/chromosome.py | 9 +-------- mep/genetics/gene.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/mep/genetics/chromosome.py 
b/mep/genetics/chromosome.py index b08642d..1762b72 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -196,14 +196,7 @@ def pretty_string(self, stop_at_best=True): # now show each gene on a separate line for gene_index, gene in enumerate(self.genes): - gene_str = str(gene) - if isinstance(gene, VariableGene): - gene_str = gene.pretty_string() - elif isinstance(gene, OperatorGene): - # TODO: why not push to the gene? - gene_str = "{}(PROGRAM[{}], PROGRAM[{}])".format(gene.operation.function_name(), - gene.address1, gene.address2) - program += "{}:{}\n".format(gene_index, gene_str) + program += "{}:{}\n".format(gene_index, gene.pretty_string()) if self.best_gene_index == gene_index and stop_at_best: return program diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index aa6d3c3..5246355 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -32,7 +32,11 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> :return: error (sum of error across the examples); modifies the eval_matrix :rtype: float """ + raise NotImplementedError() + @abstractmethod + def pretty_string(self) -> str: + raise NotImplementedError() # NOTE: Should we also add a mutate method to the gene itself? Considering that we are doing the mutation by doing # a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit. @@ -99,11 +103,10 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> def __str__(self): return "VariableGene({}, is_feature={})".format(self.index, self.is_feature) - def pretty_string(self): + def pretty_string(self) -> str: """ Pretty program string version. 
:return: string version - :rtype: str """ if self.is_feature: return "FEATURES[{}]".format(self.index) @@ -187,3 +190,10 @@ def __eq__(self, other): # NOTE: the operators are the same if they are of the same type return isinstance(self.operation, type(other.operation)) and self.address1 == other.address1 and self.address2 == other.address2 + + def pretty_string(self) -> str: + """ + Pretty program string version. + :return: string version + """ + return "{}(PROGRAM[{}], PROGRAM[{}])".format(self.operation.function_name(), self.address1, self.address2) From 84bf2624d5de03176d07a02eec1b383fc9589123 Mon Sep 17 00:00:00 2001 From: pjacobs Date: Tue, 29 Oct 2019 21:16:37 +0000 Subject: [PATCH 51/51] division operator --- mep/genetics/chromosome.py | 6 ++++-- mep/genetics/operator.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 1762b72..d669a15 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -2,7 +2,7 @@ import numpy as np from collections import deque from mep.genetics.gene import Gene, VariableGene, OperatorGene -from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator +from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator, DivisionOperator from mep.genetics.operator import MinOperator, MaxOperator from random import random, randint, choice @@ -19,7 +19,9 @@ class Chromosome: MultiplicationOperator, SubtractionOperator, MinOperator, - MaxOperator] + MaxOperator, + DivisionOperator + ] def __init__(self, genes, constants): """ diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py index 5790c1a..347c07b 100644 --- a/mep/genetics/operator.py +++ b/mep/genetics/operator.py @@ -1,3 +1,6 @@ +import math +import traceback +import numpy as np from abc import ABCMeta, abstractmethod @@ -148,4 +151,32 @@ def function_python_definition(self): 
return """ def max_(x, y): return max(x, y) + """ + + +class DivisionOperator(Operator): + """ + Perform the division operation. + """ + + def __call__(self, *args, **kwargs): + """ + Perform the operation + """ + if len(args) != 2: + raise RuntimeError("Pow operator needs just two arguments") + x = args[0] + y = args[1] + + # looping back around division + # NOTE: this is pretty weird but + return x / y if y != 0 else 0 + + def function_name(self): + return "division_" + + def function_python_definition(self): + return """ +def division_(x, y): + return x / y if y != 0 else 0 """ \ No newline at end of file