diff --git a/.gitignore b/.gitignore index 8419443..9eda0dc 100644 --- a/.gitignore +++ b/.gitignore @@ -92,4 +92,5 @@ ENV/ .idea/* # logs -output_logs/* \ No newline at end of file +output_logs/* +ignored/* \ No newline at end of file diff --git a/README.md b/README.md index 632143f..9486b32 100644 --- a/README.md +++ b/README.md @@ -1 +1,44 @@ -# py-mep \ No newline at end of file +# Multi Expression Programming + +This is an implementation of the MEP algorithm defined here: + +> Oltean Mihai, D. Dumitrescu, Multi Expression Programming, Technical report, UBB. + +Based upon the C++ code [here](https://github.com/mepx/mep-basic-src). + +## Running py-mep + +Create the conda environment and source it (Linux): + +``` +conda env create -f environment.yaml +source activate py-mep-dev +``` + +Or using `pip` we could do: + +``` +virtualenv -p python3 .venv +pip install -r requirements.txt +``` + +Example, running with a dataset `python -m mep.main datasets/data1.csv test.py`. This will run a full MEP population evolution to solve the problem specified in the data CSV, determine the best chromosome, prune it, and then convert that chromosome into a functioning python program that can be run by passing in the feature inputs. Example, `python test.py 5 10`. 
+ +An example Python program evolved to solve the addition problem of adding together two features (ex: datasets/data1.csv): +``` +import sys + +if __name__ == "__main__": + # constants + constants = [0.45084442258242485, -0.464331279636617, -0.5128830066318446] + + # now the genes + program = [0] * 3 + program[0] = float(sys.argv[2]) + program[1] = float(sys.argv[1]) + program[2] = program[0] + program[1] + + + # print out the final answer + print(program[2]) +``` \ No newline at end of file diff --git a/datasets/data1.csv b/datasets/data1.csv new file mode 100644 index 0000000..2a906b5 --- /dev/null +++ b/datasets/data1.csv @@ -0,0 +1,10 @@ +x1,x2,target +0,0,0 +1,2,3 +12,2,14 +-12,90,78 +3,4,7 +0,-1,-1 +23,0,23 +8,16,24 +-10,-15,-25 diff --git a/datasets/data2.csv b/datasets/data2.csv new file mode 100644 index 0000000..f839365 --- /dev/null +++ b/datasets/data2.csv @@ -0,0 +1,10 @@ +x1,x2,x3,target +0,0,0,0 +1,2,2,4 +12,2,4,28 +-12,90,5,-1075 +3,4,1,13 +0,-1,-10,-10 +23,0,15,15 +8,16,1,129 +-10,-15,-50,100 diff --git a/datasets/data3.csv b/datasets/data3.csv new file mode 100644 index 0000000..ae9e86e --- /dev/null +++ b/datasets/data3.csv @@ -0,0 +1,20 @@ +x1,x2,x3,target +1,2,3,0 +3,2,1,10 +2,3,1,6 +0,0,0,0 +5,4,10,19 +7,6,4,51 +-1,2,5,-2 +2,-1,6,-3 +3,4,-10,23 +5,6,9,22 +-3,-6,-6,9 +-7,4,0,53 +0,2,5,-3 +6,0,-1,37 +-8,5,0,69 +-3,-3,-3,9 +2,1,2,3 +-6,-3,2,31 +0,9,4,5 diff --git a/datasets/data4.csv b/datasets/data4.csv new file mode 100644 index 0000000..7b48557 --- /dev/null +++ b/datasets/data4.csv @@ -0,0 +1,51 @@ +x1,x2,x3,target +93.0,16.0,-85.0,-7905.0 +-49.0,-5.0,16.0,-80.0 +-41.0,-21.0,79.0,-1659.0 +27.0,81.0,-63.0,-5103.0 +-9.0,-64.0,84.0,-756.0 +81.0,21.0,5.0,405.0 +-20.0,60.0,-75.0,-4500.0 +93.0,-35.0,53.0,4929.0 +0.0,-29.0,80.0,0.0 +-19.0,4.0,-33.0,-132.0 +-88.0,15.0,91.0,1365.0 +-57.0,-100.0,28.0,-1596.0 +91.0,-45.0,-36.0,-3276.0 +-50.0,19.0,-87.0,-1653.0 +-45.0,100.0,48.0,4800.0 +-32.0,22.0,-21.0,-462.0 +59.0,49.0,-46.0,-2714.0 
+-40.0,-92.0,5.0,-200.0 +32.0,59.0,-85.0,-5015.0 +-94.0,-29.0,-8.0,232.0 +-21.0,-68.0,-26.0,546.0 +-25.0,26.0,81.0,2106.0 +12.0,25.0,-85.0,-2125.0 +-11.0,40.0,-57.0,-2280.0 +51.0,73.0,7.0,511.0 +100.0,-77.0,-43.0,-4300.0 +-74.0,-35.0,21.0,-735.0 +-34.0,-90.0,-14.0,476.0 +-84.0,-2.0,8.0,-16.0 +-41.0,92.0,-7.0,-644.0 +57.0,-85.0,65.0,3705.0 +38.0,55.0,71.0,3905.0 +-20.0,17.0,-100.0,-1700.0 +-42.0,40.0,-51.0,-2040.0 +-98.0,-52.0,-84.0,4368.0 +80.0,42.0,-4.0,-320.0 +63.0,-32.0,-46.0,-2898.0 +-3.0,-76.0,-19.0,57.0 +-37.0,20.0,-76.0,-1520.0 +-92.0,-24.0,-83.0,1992.0 +-23.0,-64.0,-89.0,2047.0 +94.0,-84.0,65.0,6110.0 +-54.0,3.0,-76.0,-228.0 +59.0,-61.0,49.0,2891.0 +33.0,-51.0,-32.0,-1056.0 +30.0,-22.0,46.0,1380.0 +2.0,36.0,0.0,0.0 +4.0,87.0,-88.0,-7656.0 +61.0,-13.0,-32.0,-1952.0 +46.0,13.0,-76.0,-3496.0 diff --git a/datasets/files.txt b/datasets/files.txt new file mode 100644 index 0000000..482a2ae --- /dev/null +++ b/datasets/files.txt @@ -0,0 +1,4 @@ +data1.csv is f(x_1, x_2) = x_1 + x_2 +data2.csv is f(x_1, x_2, x_3) = x_1 * x_2 + x_3 +data3.csv is f(x_1, x_2, x_3) = x_1 * x_1 + x_2 - x_3 +data4.csv is f(x_1, x_2, x_3) = max(x_1, x_2) * x_3 \ No newline at end of file diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..61b1b1c --- /dev/null +++ b/environment.yaml @@ -0,0 +1,95 @@ +name: py-mep-dev +dependencies: +- backports=1.0=py27_0 +- backports_abc=0.5=py27_0 +- bleach=1.5.0=py27_0 +- configparser=3.5.0=py27_0 +- dbus=1.10.10=0 +- decorator=4.0.11=py27_0 +- entrypoints=0.2.2=py27_1 +- enum34=1.1.6=py27_0 +- expat=2.1.0=0 +- fontconfig=2.12.1=3 +- freetype=2.5.5=2 +- functools32=3.2.3.2=py27_0 +- get_terminal_size=1.0.0=py27_0 +- glib=2.50.2=1 +- gst-plugins-base=1.8.0=0 +- gstreamer=1.8.0=0 +- html5lib=0.999=py27_0 +- icu=54.1=0 +- ipykernel=4.6.0=py27_0 +- ipython=5.3.0=py27_0 +- ipython_genutils=0.2.0=py27_0 +- ipywidgets=6.0.0=py27_0 +- jinja2=2.9.6=py27_0 +- jpeg=9b=0 +- jsonschema=2.5.1=py27_0 +- jupyter=1.0.0=py27_3 +- 
jupyter_client=5.0.1=py27_0 +- jupyter_console=5.1.0=py27_0 +- jupyter_core=4.3.0=py27_0 +- libffi=3.2.1=1 +- libgcc=5.2.0=0 +- libiconv=1.14=0 +- libpng=1.6.27=0 +- libsodium=1.0.10=0 +- libxcb=1.12=1 +- libxml2=2.9.4=0 +- markupsafe=0.23=py27_2 +- mistune=0.7.4=py27_0 +- mkl=2017.0.1=0 +- nbconvert=5.1.1=py27_0 +- nbformat=4.3.0=py27_0 +- notebook=5.0.0=py27_0 +- numpy=1.12.1=py27_0 +- openssl=1.0.2k=1 +- pandas=0.19.2=np112py27_1 +- pandocfilters=1.4.1=py27_0 +- path.py=10.1=py27_0 +- pathlib2=2.2.1=py27_0 +- pcre=8.39=1 +- pexpect=4.2.1=py27_0 +- pickleshare=0.7.4=py27_0 +- pip=9.0.1=py27_1 +- prompt_toolkit=1.0.14=py27_0 +- ptyprocess=0.5.1=py27_0 +- py=1.4.32=py27_0 +- pygments=2.2.0=py27_0 +- pyqt=5.6.0=py27_2 +- pytest=3.0.7=py27_0 +- python=2.7.13=0 +- python-dateutil=2.6.0=py27_0 +- pytz=2017.2=py27_0 +- pyzmq=16.0.2=py27_0 +- qt=5.6.2=3 +- qtconsole=4.3.0=py27_0 +- readline=6.2=2 +- scandir=1.5=py27_0 +- setuptools=27.2.0=py27_0 +- simplegeneric=0.8.1=py27_1 +- singledispatch=3.4.0.3=py27_0 +- sip=4.18=py27_0 +- six=1.10.0=py27_0 +- sqlite=3.13.0=0 +- ssl_match_hostname=3.4.0.2=py27_1 +- terminado=0.6=py27_0 +- testpath=0.3=py27_0 +- tk=8.5.18=0 +- tornado=4.4.2=py27_0 +- traitlets=4.3.2=py27_0 +- wcwidth=0.1.7=py27_0 +- wheel=0.29.0=py27_0 +- widgetsnbextension=2.0.0=py27_0 +- zeromq=4.1.5=0 +- zlib=1.2.8=3 +- pip: + - backports-abc==0.5 + - backports.shutil-get-terminal-size==1.0.0 + - backports.ssl-match-hostname==3.4.0.2 + - ipython-genutils==0.2.0 + - jupyter-client==5.0.1 + - jupyter-console==5.1.0 + - jupyter-core==4.3.0 + - prompt-toolkit==1.0.14 + diff --git a/mep/config/config.json b/mep/config/config.json index d1dea47..efb51db 100644 --- a/mep/config/config.json +++ b/mep/config/config.json @@ -8,7 +8,7 @@ "mutation_probability": 0.1, "crossover_probability": 0.9, - "variables_probability": 0.4, + "feature_variables_probability": 0.4, "operators_probability": 0.5, "num_constants": 3, diff --git a/mep/dataset.py b/mep/dataset.py index 
850d93a..2962c0d 100644 --- a/mep/dataset.py +++ b/mep/dataset.py @@ -1,7 +1,7 @@ import pandas as pd -class DataSet(object): +class DataSet: """ Encapsulate a data set. Feature vectors and their targets. """ @@ -13,5 +13,9 @@ def __init__(self, filename): :param filename: the filename (full path to CSV) of the data :type filename: str """ - # TODO: What about supporting other file formats? - self.data = pd.read_csv(filename) \ No newline at end of file + # we assume this in the format of feature cols and then target + self.data = pd.read_csv(filename) + + # extract out data matrix and target + self.target = self.data.target.values + self.data_matrix = self.data.drop("target", 1).values diff --git a/mep/genetics/chromosome.py b/mep/genetics/chromosome.py index 6d58da2..d669a15 100644 --- a/mep/genetics/chromosome.py +++ b/mep/genetics/chromosome.py @@ -1,9 +1,13 @@ import logging +import numpy as np +from collections import deque from mep.genetics.gene import Gene, VariableGene, OperatorGene +from mep.genetics.operator import AdditionOperator, MultiplicationOperator, SubtractionOperator, DivisionOperator +from mep.genetics.operator import MinOperator, MaxOperator from random import random, randint, choice -class Chromosome(object): +class Chromosome: """ Level above Gene. Each chromosome is a fixed number of genes and constants. We can think of a chromosome as a program where each gene is a line of code in the program. 
Genes can reference the result of other genes by their @@ -11,9 +15,13 @@ class Chromosome(object): """ # valid operators - operator_lambdas = [lambda a, b: a + b, # + - lambda a, b: a - b, # - - lambda a, b: a * b] # * + operators_family = [AdditionOperator, + MultiplicationOperator, + SubtractionOperator, + MinOperator, + MaxOperator, + DivisionOperator + ] def __init__(self, genes, constants): """ @@ -23,13 +31,15 @@ def __init__(self, genes, constants): :param constants: the constants :type constants: list of float """ - # self.logger = logging.getLogger(self.__class__) + self.logger = logging.getLogger(self.__class__.__name__) # core genes and constants lists self.genes = genes self.constants = constants - # TODO: track the best fitness and the associated best gene seen so far + # track the best found error and the associated gene + self.error = float('inf') + self.best_gene_index = -1 @classmethod def generate_random_chromosome(cls, num_constants, constants_min, constants_max, constants_prob, @@ -76,7 +86,7 @@ def generate_random_chromosome(cls, num_constants, constants_min, constants_max, prob = random() if prob <= operators_prob: # randomly choose valid addresses; randomly choose an operator - genes.append(OperatorGene(choice(Chromosome.operator_lambdas), + genes.append(OperatorGene(choice(Chromosome.operators_family)(), randint(0, gene_index - 1), randint(0, gene_index - 1))) elif prob <= operators_prob + feature_variable_prob: genes.append(VariableGene(randint(0, num_feature_variables - 1), is_feature=True)) @@ -86,8 +96,229 @@ def generate_random_chromosome(cls, num_constants, constants_min, constants_max, # construct and return the chromosome return Chromosome(genes, constants) + def evaluate(self, data_matrix, targets): + """ + Evaluate the various genes. + + :param data_matrix: the data matrix; rows are feature vectors; comes from the data set; it is (n, m) where "n" + is the number of examples and "m" is the number of features. 
+ :type data_matrix: np.matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + """ + num_examples = data_matrix.shape[0] + eval_matrix = np.zeros((len(self.genes), num_examples)) + for gene_index, gene in enumerate(self.genes): + # compute the error for this gene; if it is the best we have found then update + error = gene.evaluate(gene_index, eval_matrix, data_matrix, self.constants, targets) + if error < self.error: + self.error = error + self.best_gene_index = gene_index + + def predict(self, data_matrix): + """ + Return the predictions for this data. + :param data_matrix: the sample data; matrix with (n_samples, n_features) + :type data_matrix: np.matrix + :return: the prediction for each sample; array-like (n_samples) length + :rtype: np.array + """ + # NOTE: This is almost identical to evaluate except that we are running after we have done the fit so we have + # already determined the best gene index and we just want to calculate the values; no error calc + num_examples = data_matrix.shape[0] + eval_matrix = np.zeros((len(self.genes), num_examples)) + dummy_targets = [0] * num_examples + for gene_index, gene in enumerate(self.genes): + # compute the error for this gene; if it is the best we have found then update + gene.evaluate(gene_index, eval_matrix, data_matrix, self.constants, dummy_targets) + if self.best_gene_index == gene_index: + # extract from the eval_matrix; these from this gene (line in program) for each of the examples + return eval_matrix[gene_index, :] + + def mutate(self, gene_mutation_prob, num_constants, constants_min, constants_max, constants_prob, + feature_variable_prob, num_feature_variables, num_genes, operators_prob): + """ + Mutate the chromosome. Works by going through and randomly mutating genes and then constants. 
+ :param gene_mutation_prob: probability to mutate a given gene + :type gene_mutation_prob: float + :param num_constants: how many constants to have + :type num_constants: int + :param constants_min: the min range of the constants + :type constants_min: float + :param constants_max: the max range of the constants + :type constants_max: float + :param constants_prob: the probability that a given gene is a constant + :type constants_prob: float + :param feature_variable_prob: the probability that a given gene is a feature variable + :type feature_variable_prob: float + :param num_feature_variables: how many features we have + :type num_feature_variables: int + :param num_genes: how many genes + :type num_genes: int + :param operators_prob: the probability that a given gene is an operator + :type operators_prob: float + :return: nothing + """ + # the probabilities are all the same for generating a random chromosome; therefore let's construct + # a random chromosome and then (effectively) do a uniform crossover where a "mutate" means that we + # take the new chromosome's gene/constants + # TODO: Should we have these variables set in the chromosome then? + # TODO: maybe just pass in this random chromosome then? 
+ random_chromosome = Chromosome.generate_random_chromosome(num_constants, constants_min, + constants_max, constants_prob, + feature_variable_prob, + num_feature_variables, num_genes, + operators_prob) + + # go through mutating genes; + for gene_index in range(len(self.genes)): + # decide if we are going to mutate this gene + if random() <= gene_mutation_prob: + # mutated; therefore grab the corresponding gene from the random chromosome + self.genes[gene_index] = random_chromosome.genes[gene_index] + + # go through mutating constants; + for constants_index in range(len(self.constants)): + # decide if we are going to mutate this gene + if random() <= gene_mutation_prob: + # mutated; therefore grab the corresponding constant from the random chromosome + self.constants[constants_index] = random_chromosome.constants[constants_index] + def __str__(self): return "Chromosome({}, {})".format(self.genes, self.constants) + def pretty_string(self, stop_at_best=True): + """ + Output in a program like format. First show the constants. Then one line per gene. + :return: the program + :rtype: str + """ + # first we show the constants + program = "CONSTANTS = [{}]\n".format(",".join([str(c) for c in self.constants])) + + # now show each gene on a separate line + for gene_index, gene in enumerate(self.genes): + program += "{}:{}\n".format(gene_index, gene.pretty_string()) + + if self.best_gene_index == gene_index and stop_at_best: + return program + + # if we want to print the full program + return program + + def prune(self): + """ + Trim out the unused genes. NOTE: This "breaks" the chromosomes as it is going to change how many genes are + in the program. Only do this once we have finished evolving the program. 
+ """ + + # TODO: drop genes which do nothing; ex: min(x[0], x[0]) or max(x[0], x[0]) + + # the best gene index is going to be the last line of the program; since the genes never reference genes + # beyond it then we just proceed back to the top and remove any which haven't been referenced; we determine + # this via a BFS type search + + # the genes that are in use -- i.e. that will be kept; + gene_indices_in_use = set() + visited = set() + + # start from best gene index + genes_indices_to_visit = deque() + genes_indices_to_visit.appendleft(self.best_gene_index) + gene_indices_in_use.add(self.best_gene_index) + + while len(genes_indices_to_visit) > 0: + # the index to visit + gene_index = genes_indices_to_visit.pop() + + # mark as visited + visited.add(gene_index) + + # check the addresses on the gene if it is an operator + gene = self.genes[gene_index] + if isinstance(gene, OperatorGene): + genes_indices_to_visit.appendleft(gene.address1) + genes_indices_to_visit.appendleft(gene.address2) + gene_indices_in_use.add(gene.address1) + gene_indices_in_use.add(gene.address2) + self.logger.debug("At gene index {} which references {} and {}".format(gene_index, + gene.address1, gene.address2)) + + # now remove any genes that aren't used + gene_indices_in_use = list(gene_indices_in_use) + gene_indices_in_use.sort() + self.logger.debug("All gene indices in use {}".format(gene_indices_in_use)) + self.genes = [self.genes[i] for i in gene_indices_in_use] + + # TODO: This could be done in the list comprehension but it is clearer to just do another pass + # re-map the address to the new index + for gene in self.genes: + if isinstance(gene, OperatorGene): + gene.address1 = gene_indices_in_use.index(gene.address1) + gene.address2 = gene_indices_in_use.index(gene.address2) + + # the now "best gene" is just the last one + self.best_gene_index = len(self.genes) - 1 + + def to_python(self): + """ + Convert to python program string. 
+ :return: python string program + :rtype: str + """ + # python program string + python_program = """ +import sys + +# define operator/functions +{} + +if __name__ == "__main__": + # constants + {} + + # now the genes + {} + + # print out the final answer + {} + """ + # define all the function/operators + operator_def_str = "\n".join([operator().function_python_definition() for operator in self.operators_family]) + + # constants + constants_str = "constants = {}".format(self.constants) + + # genes + genes_str = "program = [0] * {}\n".format(len(self.genes)) + for gene_index, gene in enumerate(self.genes): + genes_str += " program[{}] = ".format(gene_index) + if isinstance(gene, VariableGene): + if gene.is_feature: + genes_str += "float(sys.argv[{}])".format(gene.index + 1) + else: + genes_str += "constants[{}]".format(gene.index) + elif isinstance(gene, OperatorGene): + genes_str += "{}(program[{}], program[{}])".format(gene.operation.function_name(), + gene.address1, gene.address2) + genes_str += "\n" + + # print statement + python_program = python_program.format(operator_def_str, constants_str, genes_str, + "print(program[{}])".format(len(self.genes)-1)) + + # return it + return python_program + def __repr__(self): return self.__str__() + + def __lt__(self, other): + """ + Less-than used by sort(...) + + :param other: + :type other: Chromosome + :return: + """ + return self.error < other.error diff --git a/mep/genetics/gene.py b/mep/genetics/gene.py index a00c8bd..5246355 100644 --- a/mep/genetics/gene.py +++ b/mep/genetics/gene.py @@ -1,16 +1,19 @@ import logging +from typing import Union, Callable + import numpy as np from abc import ABCMeta, abstractmethod +from mep.genetics.operator import Operator + -class Gene(object): +class Gene(metaclass=ABCMeta): """ Lowest level of the genetic structure of MEP. Think of this as one line of code in the program. 
""" - __metaclass__ = ABCMeta @abstractmethod - def evaluate(self, gene_index, eval_matrix, data_matrix, constants): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float: """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. @@ -24,32 +27,38 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants): :type data_matrix: np.matrix :param constants: the constants associated with this chromosome :type constants: list - :return: nothing; modifies the eval_matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :return: error (sum of error across the examples); modifies the eval_matrix + :rtype: float """ + raise NotImplementedError() + @abstractmethod + def pretty_string(self) -> str: + raise NotImplementedError() -# TODO: Should we also add a mutate method to the gene itself? +# NOTE: Should we also add a mutate method to the gene itself? Considering that we are doing the mutation by doing +# a crossover of the whole chromosome with a new random chromosome, I don't think there is any benefit. -class VariableGene(object): +class VariableGene(Gene): """ This gene is simply a variable. Either a constant or one of the features in the data -- i.e. an input variable. """ - def __init__(self, index, is_feature=True): + def __init__(self, index: int, is_feature=True): """ The index into either the feature vector (if "is_feature" is True) or into the constants. 
:param index: the index into the vector - :type index: int :param is_feature: whether this is a feature variable or a constant - :type is_feature: bool """ # self.logger = logging.getLogger(self.__class__) self.index = index self.is_feature = is_feature - def evaluate(self, gene_index, eval_matrix, data_matrix, constants): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float: """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. @@ -64,22 +73,46 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants): :type data_matrix: np.matrix :param constants: the constants associated with this chromosome :type constants: list - :return: nothing; modifies the eval_matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :return: error (sum of error); modifies the eval_matrix + :rtype: float """ + # TODO: Move common logic up + # TODO: Handle classification as well as regression + # go through and set the data num_examples = eval_matrix.shape[1] + sum_of_errors = 0. for example_index in range(0, num_examples): # each column is one example in the data matrix (i.e. 
one feature vector) # if we are a feature variable then we look at the corresponding feature in the feature vector for this # example; otherwise (as a constant) we just go to that (independent of the example we are in) if self.is_feature: - eval_matrix[gene_index, example_index] = data_matrix[example_index, self.index] + value = data_matrix[example_index, self.index] else: - eval_matrix[gene_index, example_index] = constants[self.index] + value = constants[self.index] + # calculate error + sum_of_errors += abs(targets[example_index] - value) + + # set it in the eval matrix + eval_matrix[gene_index, example_index] = value + + return sum_of_errors def __str__(self): return "VariableGene({}, is_feature={})".format(self.index, self.is_feature) + def pretty_string(self) -> str: + """ + Pretty program string version. + :return: string version + """ + if self.is_feature: + return "FEATURES[{}]".format(self.index) + else: + return "CONSTANTS[{}]".format(self.index) + def __repr__(self): return self.__str__() @@ -89,22 +122,19 @@ def __eq__(self, other): return self.index == other.index and self.is_feature == other.is_feature -class OperatorGene(object): +class OperatorGene(Gene): """ This gene performance an operation on two addresses. The addresses are indices in the eval_matrix -- i.e. from the evaluation of other genes before this one. """ # NOTE: This could be expanded to multiple addresses - def __init__(self, operation, address1, address2): + def __init__(self, operation: Union[Callable, Operator], address1: int, address2: int): """ Initialize. 
:param operation: a lambda or function that can be operated on two floats - :type operation: lambda :param address1: index into the eval_matrix - :type address1: int :param address2: index into the eval_matrix - :type address2: int """ # self.logger = logging.getLogger(self.__class__) @@ -112,7 +142,7 @@ def __init__(self, operation, address1, address2): self.address1 = address1 self.address2 = address2 - def evaluate(self, gene_index, eval_matrix, data_matrix, constants): + def evaluate(self, gene_index, eval_matrix, data_matrix, constants, targets) -> float: """ This method will modify the eval_matrix for this gene index for each example in the data_matrix. @@ -126,16 +156,27 @@ def evaluate(self, gene_index, eval_matrix, data_matrix, constants): :type data_matrix: np.matrix :param constants: the constants associated with this chromosome :type constants: list - :return: nothing; modifies the eval_matrix + :param targets: the targets; equal to the number of examples (n) + :type targets: list + :return: error (sum of error); modifies the eval_matrix + :rtype: float + """ # go through and set the data num_examples = eval_matrix.shape[1] + sum_of_errors = 0. for example_index in range(0, num_examples): # each column is one example in the data matrix (i.e. 
one feature vector) # TODO: Catch errors; in particular division can be a problem - eval_matrix[gene_index, example_index] = self.operation(eval_matrix[self.address1][example_index], - eval_matrix[self.address2][example_index]) + value = self.operation(eval_matrix[self.address1][example_index], + eval_matrix[self.address2][example_index]) + # set it in the eval matrix + eval_matrix[gene_index, example_index] = value + + sum_of_errors += abs(targets[example_index] - value) + + return sum_of_errors def __str__(self): return "OperatorGene({}, {}, {})".format(self.operation, self.address1, self.address2) @@ -146,4 +187,13 @@ def __repr__(self): def __eq__(self, other): if other is None or not isinstance(other, OperatorGene): return False - return self.operation == other.operation and self.address1 == other.address1 and self.address2 == other.address2 + + # NOTE: the operators are the same if they are of the same type + return isinstance(self.operation, type(other.operation)) and self.address1 == other.address1 and self.address2 == other.address2 + + def pretty_string(self) -> str: + """ + Pretty program string version. + :return: string version + """ + return "{}(PROGRAM[{}], PROGRAM[{}])".format(self.operation.function_name(), self.address1, self.address2) diff --git a/mep/genetics/operator.py b/mep/genetics/operator.py new file mode 100644 index 0000000..347c07b --- /dev/null +++ b/mep/genetics/operator.py @@ -0,0 +1,182 @@ +import math +import traceback +import numpy as np +from abc import ABCMeta, abstractmethod + + +# TODO: add some more interesting operators; example pow(...), log(...), exp(...) +class Operator(metaclass=ABCMeta): + """ + This is more of a function than a traditional "operator" but the function could be simply using an operator + like "+", "-", etc. At it's core these are indivisible functions that take arguments (i.e. preceding genes) and + output some value. 
+ """ + + @abstractmethod + def __call__(self, *args, **kwargs): + """ + Run the operation/function and return the result. + """ + + @abstractmethod + def function_name(self): + """ + Return the name of the function for use in the pretty print and the python program. + """ + + @abstractmethod + def function_python_definition(self): + """ + Return the python definition of the function + """ + + def __str__(self): + return self.function_name() + + def __repr__(self): + return str(self) + + +# TODO: Consolidate these into just one Operator? +class AdditionOperator(Operator): + """ + Perform addition. + """ + # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes + + def __call__(self, *args, **kwargs): + """ + Perform addition. + """ + return sum(args) + + def function_name(self): + return "add" + + def function_python_definition(self): + return """ +def add(x, y): + return x + y + """ + + +class MultiplicationOperator(Operator): + """ + Perform multiplication + """ + # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes + + def __call__(self, *args, **kwargs): + """ + Perform subtraction. + """ + result = 1 + for arg in args: + result *= arg + + return result + + def function_name(self): + return "multiplication" + + def function_python_definition(self): + return """ +def multiplication(x, y): + return x * y + """ + + +class SubtractionOperator(Operator): + """ + Perform subtraction. + """ + # NOTE: there is no need to support more than two arguments as these can be chained across multiple genes + + def __call__(self, *args, **kwargs): + """ + Perform subtraction. + """ + result = args[0] + for arg in args[1:]: + result -= arg + + return result + + def function_name(self): + return "subtraction" + + def function_python_definition(self): + return """ +def subtraction(x, y): + return x - y + """ + + +class MinOperator(Operator): + """ + Perform the Min operation. 
import copy
import random


class Population:
    """
    A collection of chromosomes that is evolved one generation at a time.

    After :meth:`initialize` the chromosome list is sorted so that the chromosome with the lowest
    error comes first; :meth:`next_generation` relies on (and maintains) that order.
    """

    def __init__(self, data_matrix, targets, num_constants, constants_min, constants_max,
                 feature_variable_prob, num_genes, num_chromosomes, operators_prob,
                 crossover_prob=0.9, mutation_prob=0.1):
        """
        Store the configuration for a population; no chromosomes are built until initialize().

        :param data_matrix: the data matrix; rows are feature vectors; comes from the data set; it is (n, m)
            where "n" is the number of examples and "m" is the number of features.
        :type data_matrix: np.matrix
        :param targets: the targets; equal to the number of examples (n)
        :type targets: list
        :param num_constants: how many constants to have
        :type num_constants: int
        :param constants_min: the min range of the constants
        :type constants_min: float
        :param constants_max: the max range of the constants
        :type constants_max: float
        :param feature_variable_prob: the probability that a given gene is a feature variable
        :type feature_variable_prob: float
        :param num_genes: how many genes
        :type num_genes: int
        :param num_chromosomes: how many chromosomes to use
        :type num_chromosomes: int
        :param operators_prob: the probability that a given gene is an operator
        :type operators_prob: float
        :param crossover_prob: the probability that two selected parents are crossed over (otherwise the
            offspring are copies of the parents)
        :type crossover_prob: float
        :param mutation_prob: the mutation probability handed to Chromosome.mutate
        :type mutation_prob: float
        """
        # the training data
        self.data_matrix = data_matrix
        self.targets = targets

        # the constants
        self.num_constants = num_constants
        self.constants_min = constants_min
        self.constants_max = constants_max

        # gene type probabilities; whatever is not an operator or a feature variable is a constant
        self.operators_prob = operators_prob
        self.feature_variable_prob = feature_variable_prob
        self.constants_prob = 1. - operators_prob - feature_variable_prob

        # sizes; the number of feature variables is the number of columns of the data matrix
        self.num_feature_variables = self.data_matrix.shape[1]
        self.num_genes = num_genes
        self.num_chromosomes = num_chromosomes

        # evolution parameters
        self.crossover_prob = crossover_prob
        self.mutation_prob = mutation_prob

        # the chromosomes; built by initialize()
        self.chromosomes = None

    def initialize(self):
        """
        Build the random chromosomes, evaluate each against the training data and sort them.
        """
        # function-scope import keeps this class importable on its own
        from mep.genetics.chromosome import Chromosome

        # generate the random chromosomes
        self.chromosomes = [Chromosome.generate_random_chromosome(self.num_constants, self.constants_min,
                                                                  self.constants_max, self.constants_prob,
                                                                  self.feature_variable_prob,
                                                                  self.num_feature_variables, self.num_genes,
                                                                  self.operators_prob)
                            for _ in range(self.num_chromosomes)]

        # evaluate
        # TODO: this could be done in parallel
        for chromosome in self.chromosomes:
            chromosome.evaluate(self.data_matrix, self.targets)

        # sort the chromosomes
        self.chromosomes.sort()

    def random_tournament_selection(self, tournament_size):
        """
        Randomly select (tournament_size) chromosomes, with replacement, and return the best one.

        :param tournament_size: the size of the tournament; must be at least 1
        :type tournament_size: int
        :return: the selected chromosome with the lowest error (the first one drawn wins a tie)
        :rtype: Chromosome
        :raises ValueError: if the tournament size is less than 1 or there are no chromosomes to choose from
        """
        if tournament_size < 1:
            raise ValueError("tournament_size must be at least 1; got {}".format(tournament_size))
        if not self.chromosomes:
            raise ValueError("The population has no chromosomes; call initialize() first.")

        # one random.choice per contender, consumed in order by min()
        contenders = (random.choice(self.chromosomes) for _ in range(tournament_size))
        return min(contenders, key=lambda chromosome: chromosome.error)

    def one_cut_point_crossover(self, parent1, parent2):
        """
        Construct two offspring chromosomes from the parents. We determine the crossover point so that we
        take the first genes up to that point from parent1/parent2 and then we switch. The constants are
        crossed over the same way at an independently drawn cut point.

        :param parent1: one parent chromosome
        :type parent1: Chromosome
        :param parent2: the other parent chromosome
        :type parent2: Chromosome
        :return: two offsprings
        :rtype: (Chromosome, Chromosome)
        """
        # function-scope import keeps this class importable on its own
        from mep.genetics.chromosome import Chromosome

        # draw both cut points up front; randint is inclusive on both ends so a cut of 0 or of the full
        # length hands an offspring a complete copy of a single parent
        gene_cut = random.randint(0, self.num_genes)
        constant_cut = random.randint(0, self.num_constants)

        offspring1 = Chromosome([], [])
        offspring2 = Chromosome([], [])

        # the genes are deep-copied so that an offspring never shares gene objects with a parent
        offspring1.genes = copy.deepcopy(parent1.genes[:gene_cut] + parent2.genes[gene_cut:])
        offspring2.genes = copy.deepcopy(parent2.genes[:gene_cut] + parent1.genes[gene_cut:])

        # slicing and concatenating already builds fresh lists for the constants
        offspring1.constants = parent1.constants[:constant_cut] + parent2.constants[constant_cut:]
        offspring2.constants = parent2.constants[:constant_cut] + parent1.constants[constant_cut:]

        return offspring1, offspring2

    def next_generation(self):
        """
        Advance to the next generation.

        For every pair of slots in the population we select two parents by tournament, produce two
        offspring (by crossover or by copying the parents), mutate and evaluate them, and let each
        offspring that beats an existing chromosome push the worst chromosome out of the population.
        """
        for _ in range(0, len(self.chromosomes), 2):
            # select parents
            parent1 = self.random_tournament_selection(2)
            parent2 = self.random_tournament_selection(2)

            # crossover
            if random.random() < self.crossover_prob:
                offspring = self.one_cut_point_crossover(parent1, parent2)
            else:
                # offspring are copies of the parents; a deep copy is required because the offspring is
                # mutated next and a shallow copy would share its gene list with the parent
                offspring = (copy.deepcopy(parent1), copy.deepcopy(parent2))

            # mutate (potentially) the offspring; calculate error
            for child in offspring:
                self._mutate_and_evaluate(child)

            # place the offspring in the population
            for child in offspring:
                self._insert_offspring(child)

    def _mutate_and_evaluate(self, offspring):
        """
        Mutate the offspring with the population's settings and then compute its error.
        """
        offspring.mutate(self.mutation_prob, self.num_constants, self.constants_min,
                         self.constants_max, self.constants_prob,
                         self.feature_variable_prob,
                         self.num_feature_variables, self.num_genes,
                         self.operators_prob)
        offspring.evaluate(self.data_matrix, self.targets)

    def _insert_offspring(self, offspring):
        """
        Insert the offspring at its error level and drop the worst chromosome.

        The chromosomes are in sorted order so the one at the end has the highest error. The offspring goes
        in front of the first chromosome it strictly beats and the last chromosome is removed, which keeps
        both the order and the size of the population. An offspring that beats nobody is discarded.

        :return: True if the offspring entered the population
        :rtype: bool
        """
        insert_index = next((index for index, chromosome in enumerate(self.chromosomes)
                             if offspring.error < chromosome.error), None)
        if insert_index is None:
            return False

        self.chromosomes.insert(insert_index, offspring)
        self.chromosomes.pop()
        return True
import logging

import numpy as np


# NOTE: The idea is to explicitly conform to a scikit-learn type of approach where we can run fit(..) and
# predict(..) methods on the model
class MEPModel:
    """
    Encapsulate the MEP model: evolve a population on training data and keep the best chromosome found.
    """

    def __init__(self, num_constants: int, constants_min: float, constants_max: float,
                 feature_variables_probability: float, code_length: int, population_size: int,
                 operators_probability: float, num_generations: int):
        """
        Initialize; the values are stored and handed to the Population when fit(..) is called.

        :param num_constants: how many constants each chromosome has
        :param constants_min: the min range of the constants
        :param constants_max: the max range of the constants
        :param feature_variables_probability: the probability that a given gene is a feature variable
        :param code_length: how many genes each chromosome has
        :param population_size: how many chromosomes are in the population
        :param operators_probability: the probability that a given gene is an operator
        :param num_generations: the maximum number of generations to evolve
        """
        # logger
        self.logger = logging.getLogger(self.__class__.__name__)

        # core parameters
        self.num_constants = num_constants
        self.constants_min = constants_min
        self.constants_max = constants_max
        self.feature_variables_probability = feature_variables_probability
        self.code_length = code_length
        self.population_size = population_size
        self.operators_probability = operators_probability
        self.num_generations = num_generations

        # the best found chromosome from the evolution process
        self.best_chromosome = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """
        Fit the model. Given the feature vectors in matrix 'X' and the target vector 'y' we fit our model.

        :param X: the feature matrix (training data)
        :param y: the target values
        :return: nothing
        """
        # function-scope import keeps this class importable on its own
        from mep.genetics.population import Population

        # construct a population and run it for the number of generations specified
        population = Population(X, y, self.num_constants,
                                self.constants_min, self.constants_max,
                                self.feature_variables_probability,
                                self.code_length, self.population_size,
                                self.operators_probability)
        population.initialize()

        # the population is sorted so the first chromosome is the best one; taking it here means that
        # a run of zero generations still ends with a fitted model
        self.best_chromosome = population.chromosomes[0]

        # iterate through the generations
        for generation in range(self.num_generations):
            self.logger.debug("Generation number {} best chromosome error {} with {} chromosomes ".format(
                generation, self.best_chromosome.error, len(population.chromosomes)))

            if self.best_chromosome.error == 0:
                self.logger.debug("Exiting early as we have hit the best possible error.")
                break
            population.next_generation()

            # re-read the best chromosome after evolving so the final generation's result is kept
            self.best_chromosome = population.chromosomes[0]

        self.logger.debug("Best chromosome error {} and chromosome (pretty)\n {}".format(
            self.best_chromosome.error, self.best_chromosome.pretty_string()))

        # prune out the unused genes
        self.best_chromosome.prune()

        self.logger.debug("Pruned chromosome (pretty)\n {}".format(self.best_chromosome.pretty_string()))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Return the predictions for this data.

        :param X: the sample data; matrix with (n_samples, n_features)
        :return: the prediction for each sample; array-like (n_samples) length
        :raises ValueError: if the model hasn't been fit
        """
        return self._fitted_chromosome().predict(X)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
        ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).
        A constant model that always predicts the expected value of y, disregarding the input features, would
        get a R^2 score of 0.0.

        (NOTE: Comment taken from scikit-learn.)

        :param X: the sample data; matrix with (n_samples, n_features)
        :param y: the target values
        :type y: array-like, shape = (n_samples)
        :return: the score
        :raises ValueError: if the model hasn't been fit
        """
        y_pred = self.predict(X)
        u = ((y - y_pred) ** 2).sum()
        v = ((y - y.mean()) ** 2).sum()

        # NOTE: v is zero when every target is identical; that case is not special-cased here
        return 1 - u/v

    # NOTE: These are NOT scikit-learn methods now
    def to_python(self):
        """
        Return a python program which can run the model directly via direct inputs.

        :return: the python program (string)
        :rtype: str
        :raises ValueError: if the model hasn't been fit
        """
        return self._fitted_chromosome().to_python()

    def _fitted_chromosome(self):
        """
        Return the best chromosome; raise a ValueError if the model hasn't been fit yet.
        """
        if self.best_chromosome is None:
            raise ValueError("The model hasn't been fit.")

        return self.best_chromosome
+ """ + return self.error_to_return + + class TestChromosome(unittest.TestCase): """ Tests for the chromosome. @@ -27,14 +43,47 @@ def test_basic_random_construction(self): operators_prob=0.5) # confirm the number of genes and constants match what we expect - self.assertEquals(num_genes, len(chromosome.genes)) - self.assertEquals(num_constants, len(chromosome.constants)) + self.assertEqual(num_genes, len(chromosome.genes)) + self.assertEqual(num_constants, len(chromosome.constants)) # the first gene has to be a variable gene; in particular it is this one - self.assertEquals(VariableGene(0, is_feature=False), chromosome.genes[0]) + self.assertEqual(VariableGene(0, is_feature=False), chromosome.genes[0]) # the 2nd gene can be a variable or an operator; in this case it is the below - self.assertEquals(OperatorGene(Chromosome.operator_lambdas[1], 0, 0), chromosome.genes[1]) + self.assertEqual(OperatorGene(Chromosome.operators_family[4](), 0, 0), chromosome.genes[1]) # verify constant - self.assertAlmostEquals(8.599796663725433, chromosome.constants[0]) + self.assertAlmostEqual(8.599796663725433, chromosome.constants[0]) + + def test_evaluate(self): + """ + Basic test of the evaluate method. + """ + # construct mocked genes + genes = [MockedGene(10), MockedGene(1)] + + # construct chromosome + chromosome = Chromosome(genes, constants=[1, 2, 3]) + + # evaluate + chromosome.evaluate(np.zeros((2, 2)), targets=[20, 30]) + + # confirm the genes + self.assertEqual(genes[1], genes[chromosome.best_gene_index]) + self.assertEqual(genes[1].error_to_return, chromosome.error) + + def test_sort(self): + """ + Test the sort mechanism. 
+ """ + # construct the chromosomes and test sorting them (by error) + min_chromosome, mid_chromosome, max_chromosome = Chromosome([], []), Chromosome([], []), Chromosome([], []) + min_chromosome.error = 1 + mid_chromosome.error = 2 + max_chromosome.error = 3 + chromosomes = [mid_chromosome, max_chromosome, min_chromosome] + expected_chromosomes = [min_chromosome, mid_chromosome, max_chromosome] + + # do the sort and verify + chromosomes.sort() + self.assertEqual(expected_chromosomes, chromosomes) diff --git a/tests/mep/genetics/test_gene.py b/tests/mep/genetics/test_gene.py index 40aa521..037ef00 100644 --- a/tests/mep/genetics/test_gene.py +++ b/tests/mep/genetics/test_gene.py @@ -25,14 +25,16 @@ def test_basic_constant(self): constants = [1., 2.] eval_matrix = np.zeros((num_genes, num_examples)) data_matrix = np.zeros((num_examples, num_features)) + targets = [0] * num_examples # expected; only one gene and it is going to be using the first constant; gene_index = 0 - expected_eval_matrix = np.matrix([[constants[constant_index], constants[constant_index]]]) + expected_eval_matrix = np.array([[constants[constant_index], constants[constant_index]]]) # run the evaluate - gene.evaluate(gene_index, eval_matrix, data_matrix, constants) + error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) + self.assertEqual((1. - 0) + (1. - 0), error) def test_basic_feature_gene(self): """ @@ -51,6 +53,7 @@ def test_basic_feature_gene(self): constants = [1., 2.] eval_matrix = np.zeros((num_genes, num_examples)) data_matrix = np.zeros((num_examples, num_features)) + targets = [0] * num_examples # set the data matrix for the feature that we care about data_matrix[0, feature_index] = 5. 
@@ -58,11 +61,12 @@ def test_basic_feature_gene(self): # expected; only one gene and it is going to be using the first constant; gene_index = 0 - expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]]]) + expected_eval_matrix = np.array([[data_matrix[0, feature_index], data_matrix[1, feature_index]]]) # run the evaluate - gene.evaluate(gene_index, eval_matrix, data_matrix, constants) + error = gene.evaluate(gene_index, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) + self.assertEqual((5. - 0.) + (7. - 0.), error) def test_constant_and_feature_gene(self): """ @@ -83,18 +87,19 @@ def test_constant_and_feature_gene(self): constants = [1., 2.] eval_matrix = np.zeros((num_genes, num_examples)) data_matrix = np.zeros((num_examples, num_features)) + targets = [0] * num_examples # set the data matrix for the feature that we care about data_matrix[0, feature_index] = 5. data_matrix[1, feature_index] = 7. 
# expected; - expected_eval_matrix = np.matrix([[data_matrix[0, feature_index], data_matrix[1, feature_index]], + expected_eval_matrix = np.array([[data_matrix[0, feature_index], data_matrix[1, feature_index]], [constants[constant_index], constants[constant_index]]]) # run the evaluate - feature_gene.evaluate(0, eval_matrix, data_matrix, constants) - constant_gene.evaluate(1, eval_matrix, data_matrix, constants) + feature_error = feature_gene.evaluate(0, eval_matrix, data_matrix, constants, targets) + constant_error = constant_gene.evaluate(1, eval_matrix, data_matrix, constants, targets) self.assertTrue(np.array_equal(expected_eval_matrix, eval_matrix)) def test_operator_gene_basic(self): @@ -111,6 +116,7 @@ def test_operator_gene_basic(self): num_examples = 1 num_genes = 2 num_features = 3 + targets = [0] * num_examples # create constants = [] @@ -121,9 +127,8 @@ def test_operator_gene_basic(self): eval_matrix[0, 0] = 2 # expected; first gene is unchanged; the 2nd one is the sum of the first with itself (i.e. 
import datetime as dt
import logging
import os
import random
import unittest

import numpy as np

from mep.model import MEPModel

# make reproducible
random.seed(1)

# the log directory's contents are git-ignored so it may not exist in a fresh checkout; create it before
# the file handler is opened (main.py does the same for its own log file)
os.makedirs("output_logs", exist_ok=True)

logging.basicConfig(filename="output_logs/TEST_{}.log".format(dt.datetime.now().strftime("%Y%m%d")),
                    level=logging.DEBUG,
                    filemode='w',
                    format="%(asctime)s %(name)s %(funcName)s %(levelname)s %(message)s")
logger = logging.getLogger("main")


class TestModel(unittest.TestCase):
    """
    Test the model end to end: fit it on samples of a known function and score it on fresh samples.
    """

    def _generate_train_and_test(self, function_to_learn, num_samples, num_args):
        """
        Draw a training set and then a test set of random integer samples for the function.

        :param function_to_learn: the function that produces the target from the feature values
        :param num_samples: how many samples to draw for each of the two sets
        :param num_args: how many features (arguments of the function) per sample
        :return: training feature matrix, training targets, test feature matrix, test targets
        """
        def draw_samples():
            feature_matrix, target_vector = [], []
            for _ in range(num_samples):
                args = [random.randint(-250, 250) for _ in range(num_args)]
                feature_matrix.append(args)
                target_vector.append(function_to_learn(*args))
            return np.matrix(feature_matrix), np.array(target_vector)

        # the training samples are drawn first so the random sequence is consumed in a fixed order
        training_feature_matrix, training_target_vector = draw_samples()
        test_feature_matrix, test_target_vector = draw_samples()

        return training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector

    def _assert_model_learns(self, function_to_learn, num_args, operators_probability):
        """
        Fit a model on samples of the function and require a perfect score on the test samples.
        """
        model = MEPModel(num_constants=2, constants_min=-1, constants_max=1,
                         feature_variables_probability=0.4, code_length=50,
                         population_size=100, operators_probability=operators_probability,
                         num_generations=200)

        # generate data from this function
        training_feature_matrix, training_target_vector, test_feature_matrix, test_target_vector = \
            self._generate_train_and_test(function_to_learn, 100, num_args)

        # fit the model
        model.fit(training_feature_matrix, training_target_vector)

        self.assertEqual(model.score(test_feature_matrix, test_target_vector), 1)

    def test_model_basic(self):
        """
        Learn the sum of two features.
        """
        def function_to_learn(x1, x2):
            return x1 + x2

        self._assert_model_learns(function_to_learn, num_args=2, operators_probability=0.5)

    def test_model_min_max(self):
        """
        Learn a combination of min and max over four features.
        """
        def function_to_learn(x1, x2, x3, x4):
            return min(x1, x2) + max(x3, x4)

        self._assert_model_learns(function_to_learn, num_args=4, operators_probability=0.7)

    def test_model_pow(self):
        """
        Learn a polynomial; the fourth feature is deliberately unused by the function.
        """
        def function_to_learn(x1, x2, x3, x4):
            return x1 * x2 + x2 * x2 + x3

        self._assert_model_learns(function_to_learn, num_args=4, operators_probability=0.5)
import unittest

from mep.genetics.operator import MultiplicationOperator, AdditionOperator, SubtractionOperator
from mep.genetics.operator import MinOperator, MaxOperator


# NOTE(review): the whitespace inside the expected triple-quoted definitions below was reconstructed from
# a whitespace-collapsed diff -- confirm it against the literals in mep/genetics/operator.py
class TestOperators(unittest.TestCase):
    """
    Test the Operator classes: the computed value, the function name and the generated python definition.
    """

    def test_multiplication_operator(self):
        """
        The multiplication operator multiplies its two arguments.
        """
        # construct the operator and test it
        operator = MultiplicationOperator()
        self.assertEqual(5 * 2, operator(5, 2))
        self.assertEqual("multiplication", operator.function_name())
        self.assertEqual("""
def multiplication(x, y):
    return x * y
        """, operator.function_python_definition())

    def test_addition_operator(self):
        """
        The addition operator adds its two arguments.
        """
        # construct the operator and test it
        operator = AdditionOperator()
        self.assertEqual(5 + 2, operator(5, 2))
        self.assertEqual("add", operator.function_name())
        self.assertEqual("""
def add(x, y):
    return x + y
        """, operator.function_python_definition())

    def test_subtraction_operator(self):
        """
        The subtraction operator subtracts the second argument from the first.
        """
        # construct the operator and test it
        operator = SubtractionOperator()
        self.assertEqual(5 - 2, operator(5, 2))
        self.assertEqual("subtraction", operator.function_name())
        self.assertEqual("""
def subtraction(x, y):
    return x - y
        """, operator.function_python_definition())

    def test_min_operator(self):
        """
        The min operator returns the smaller of its arguments.
        """
        # construct the operator and test it
        operator = MinOperator()
        self.assertEqual(min(5, 2), operator(5, 2))
        self.assertEqual("min_", operator.function_name())
        self.assertEqual("""
def min_(x, y):
    return min(x, y)
        """, operator.function_python_definition())

    def test_max_operator(self):
        """
        The max operator returns the larger of its arguments.
        """
        # construct the operator and test it
        operator = MaxOperator()
        self.assertEqual(max(5, 2), operator(5, 2))
        self.assertEqual("max_", operator.function_name())
        self.assertEqual("""
def max_(x, y):
    return max(x, y)
        """, operator.function_python_definition())
import unittest
import random
import numpy as np
from mep.genetics.population import Population
from mep.genetics.chromosome import Chromosome


class TestPopulation(unittest.TestCase):
    """
    Test the Population class.
    """

    NUM_EXAMPLES = 5
    NUM_FEATURES = 7

    def _seeded_population(self):
        """
        Seed the random generator and build a population over an all-zero data matrix.
        """
        # make it so this is repeatable
        random.seed(0)

        # construct the population
        data_matrix = np.zeros((self.NUM_EXAMPLES, self.NUM_FEATURES))
        return Population(data_matrix, [], 1, 1, 1, 1, 1, 1, 1)

    def _install_two_chromosomes(self, population):
        """
        Give the population exactly two chromosomes (errors 1 and 2) and return them as (min, max).
        """
        min_chromosome = Chromosome([], [])
        max_chromosome = Chromosome([], [])
        min_chromosome.error = 1
        max_chromosome.error = 2
        population.chromosomes = [min_chromosome, max_chromosome]
        return min_chromosome, max_chromosome

    def test_random_tournament_selection(self):
        """
        Test the random_tournament_selection(...) with a tournament of one.
        """
        population = self._seeded_population()

        # confirm the number of feature variables (not critical for this test)
        self.assertEqual(self.NUM_FEATURES, population.num_feature_variables)

        # test the tournament selection; note that with a single draw (and this seed) it randomly chooses
        # the chromosome that is not as good
        min_chromosome, max_chromosome = self._install_two_chromosomes(population)
        self.assertEqual(max_chromosome, population.random_tournament_selection(1))

    def test_larger_random_tournament_selection(self):
        """
        Test the random_tournament_selection(...) with a tournament of ten.
        """
        population = self._seeded_population()

        # test the tournament selection; with ten draws the better chromosome is among them and wins
        min_chromosome, max_chromosome = self._install_two_chromosomes(population)
        self.assertEqual(min_chromosome, population.random_tournament_selection(10))