
Commit 4b2f5a3

Merge pull request #12 from jrbarron/tokenizer
Add support for specifying a custom tokenizer on the NaiveBayes model
2 parents 86c1fda + 732a627

File tree: 3 files changed, +84 −8 lines

  text/bayes.go
  text/bayes_test.go
  text/tfidf.go

text/bayes.go

Lines changed: 23 additions & 5 deletions
@@ -156,11 +156,23 @@ type NaiveBayes struct {
     // stream holds the datastream
     stream <-chan base.TextDatapoint
 
+    // tokenizer is used by a model
+    // to split the input into tokens
+    tokenize Tokenizer
+
     // Output is the io.Writer used for logging
     // and printing. Defaults to os.Stdout.
     Output io.Writer `json:"-"`
 }
 
+// Tokenizer accepts a sentence as input and breaks
+// it down into a slice of tokens
+type Tokenizer func(string) []string
+
+func spaceTokenizer(input string) []string {
+    return strings.Split(strings.ToLower(input), " ")
+}
+
 // concurrentMap allows concurrency-friendly map
 // access via its exported Get and Set methods
 type concurrentMap struct {

@@ -236,6 +248,7 @@ func NewNaiveBayes(stream <-chan base.TextDatapoint, classes uint8, sanitize fun
 
         sanitize: transform.RemoveFunc(sanitize),
         stream:   stream,
+        tokenize: spaceTokenizer,
 
         Output: os.Stdout,
     }

@@ -249,7 +262,7 @@ func (b *NaiveBayes) Predict(sentence string) uint8 {
     sums := make([]float64, len(b.Count))
 
     sentence, _, _ = transform.String(b.sanitize, sentence)
-    words := strings.Split(strings.ToLower(sentence), " ")
+    words := b.tokenize(sentence)
     for _, word := range words {
         w, ok := b.Words.Get(word)
         if !ok {

@@ -300,7 +313,7 @@ func (b *NaiveBayes) Probability(sentence string) (uint8, float64) {
     }
 
     sentence, _, _ = transform.String(b.sanitize, sentence)
-    words := strings.Split(strings.ToLower(sentence), " ")
+    words := b.tokenize(sentence)
     for _, word := range words {
         w, ok := b.Words.Get(word)
         if !ok {

@@ -353,9 +366,7 @@ func (b *NaiveBayes) OnlineLearn(errors chan<- error) {
         if more {
             // sanitize and break up document
             sanitized, _, _ := transform.String(b.sanitize, point.X)
-            sanitized = strings.ToLower(sanitized)
-
-            words := strings.Split(sanitized, " ")
+            words := b.tokenize(sanitized)
 
             C := int(point.Y)
 
@@ -425,6 +436,13 @@ func (b *NaiveBayes) UpdateSanitize(sanitize func(rune) bool) {
     b.sanitize = transform.RemoveFunc(sanitize)
 }
 
+// UpdateTokenizer updates NaiveBayes model's tokenizer function.
+// The default implementation will convert the input to lower
+// case and split on the space character.
+func (b *NaiveBayes) UpdateTokenizer(tokenizer Tokenizer) {
+    b.tokenize = tokenizer
+}
+
 // String implements the fmt interface for clean printing. Here
 // we're using it to print the model as the equation h(θ)=...
 // where h is the perceptron hypothesis model.
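For context, a minimal sketch of how a caller might use the new hook. NewNaiveBayes, UpdateTokenizer, OnlineLearn, Predict, and base.TextDatapoint come from this diff and the test below; the comma tokenizer, the class count, and the sample datapoints are only illustrative. It assumes the repo's base package plus the standard library strings package are imported.

    // Sketch only (not part of this commit).
    stream := make(chan base.TextDatapoint, 100)
    errs := make(chan error)

    model := NewNaiveBayes(stream, 2, func(rune) bool { return false })
    model.UpdateTokenizer(func(input string) []string {
        // hypothetical tokenizer: split on commas instead of spaces
        return strings.Split(strings.ToLower(input), ",")
    })

    go model.OnlineLearn(errs)
    stream <- base.TextDatapoint{X: "I,love,the,city", Y: 1}
    stream <- base.TextDatapoint{X: "I,hate,Los,Angeles", Y: 0}
    close(stream)
    for range errs {
        // drain errors until the channel is closed, as in TestTokenizer below
    }

    _ = model.Predict("love,the,city") // expected class: 1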

text/bayes_test.go

Lines changed: 59 additions & 0 deletions
@@ -356,3 +356,62 @@ func TestPersistNaiveBayesShouldPass1(t *testing.T) {
     assert.EqualValues(t, 1, class, "Class should be 1")
     assert.True(t, p > 0.75, "There should be a greater than 75 percent chance the document is positive - Given %v", p)
 }
+
+func TestTokenizer(t *testing.T) {
+    stream := make(chan base.TextDatapoint, 100)
+    errors := make(chan error)
+
+    // This is a somewhat contrived test case since splitting on commas is
+    // probably not very useful, but it is designed to purely test the
+    // tokenizer. A more useful, but too complicated test case would be to use
+    // a tokenizer that does something like porter stemming.
+    model := NewNaiveBayes(stream, 3, func(rune) bool {
+        // do not filter out commas
+        return false
+    })
+    model.UpdateTokenizer(func(input string) []string {
+        return strings.Split(strings.ToLower(input), ",")
+    })
+
+    go model.OnlineLearn(errors)
+
+    stream <- base.TextDatapoint{
+        X: "I,love,the,city",
+        Y: 1,
+    }
+
+    stream <- base.TextDatapoint{
+        X: "I,hate,Los,Angeles",
+        Y: 0,
+    }
+
+    stream <- base.TextDatapoint{
+        X: "My,mother,is,not,a,nice,lady",
+        Y: 0,
+    }
+
+    close(stream)
+
+    for {
+        err, more := <-errors
+        if more {
+            fmt.Printf("Error passed: %v", err)
+        } else {
+            // training is done!
+            break
+        }
+    }
+
+    // now you can predict like normal
+    class := model.Predict("My,mo~~~ther,is,in,Los,Angeles") // 0
+    assert.EqualValues(t, 0, class, "Class should be 0")
+
+    // test small document classification
+    class, p := model.Probability("Mother,Los,Angeles")
+    assert.EqualValues(t, 0, class, "Class should be 0")
+    assert.True(t, p > 0.75, "There should be a greater than 75 percent chance the document is negative - Given %v", p)
+
+    class, p = model.Probability("Love,the,CiTy")
+    assert.EqualValues(t, 1, class, "Class should be 1")
+    assert.True(t, p > 0.75, "There should be a greater than 75 percent chance the document is positive - Given %v", p)
+}

text/tfidf.go

Lines changed: 2 additions & 3 deletions
@@ -3,7 +3,6 @@ package text
 import (
     "math"
     "sort"
-    "strings"
 
     "golang.org/x/text/transform"
 )

@@ -81,7 +80,7 @@ func (f Frequencies) Swap(i, j int) {
 // this is calculated
 func (t *TFIDF) TFIDF(word string, sentence string) float64 {
     sentence, _, _ = transform.String(t.sanitize, sentence)
-    document := strings.Split(strings.ToLower(sentence), " ")
+    document := t.tokenize(sentence)
 
     return t.TermFrequency(word, document) * t.InverseDocumentFrequency(word)
 }

@@ -96,7 +95,7 @@ func (t *TFIDF) TFIDF(word string, sentence string) float64 {
 // by importance
 func (t *TFIDF) MostImportantWords(sentence string, n int) Frequencies {
     sentence, _, _ = transform.String(t.sanitize, sentence)
-    document := strings.Split(strings.ToLower(sentence), " ")
+    document := t.tokenize(sentence)
 
     freq := TermFrequencies(document)
     for i := range freq {
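Since the TFIDF methods above read t.sanitize and t.tokenize directly, a custom tokenizer set on the underlying NaiveBayes model also drives the TF-IDF calculations. A hedged sketch, continuing from the example after the bayes.go diff and assuming TFIDF is declared as a conversion-compatible type of NaiveBayes (e.g. type TFIDF NaiveBayes), which this diff does not show:

    // Sketch only: assumes TFIDF shares the NaiveBayes struct layout, so the
    // tokenize field set via UpdateTokenizer carries over after conversion.
    model.UpdateTokenizer(strings.Fields) // hypothetical: split on any whitespace run
    tf := TFIDF(*model)                   // model should already be trained (see above)

    _ = tf.TFIDF("city", "I love the city")         // weight of "city" in the sentence
    _ = tf.MostImportantWords("I love the city", 2) // top 2 terms by weight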
