Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,5 @@ venv.bak/
.vscode
venv-tf/*
.pytype/
mkdocs/site
mkdocs/site
node_modules
7 changes: 6 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ env:
global:
- COVERALLS_PARALLEL=true
matrix:
# Scoring
- TEST_FILE=tests/scoring
# Labeling
- TEST_FILE=tests/labeling/
# classification part 1
Expand All @@ -17,6 +19,7 @@ env:
- TEST_FILE=tests/test_custom_multi_output_classification.py
# Embedding
- TEST_FILE=tests/embedding/
# Tokenizer
- TEST_FILE=tests/test_tokenizer.py

python:
Expand Down Expand Up @@ -44,6 +47,7 @@ install:
- pip install nose
- python -c "import kashgari;print(f'kashgari version {kashgari.__version__}')"
- git fetch --unshallow --quiet
- export PYTHONPATH=`pwd`

script: nosetests --with-coverage --cover-html --cover-html-dir=htmlcov
--cover-xml --cover-xml-file=coverage.xml --with-xunit
Expand All @@ -68,8 +72,9 @@ jobs:
- stage: Document
python: "3.6"
install:
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- pip install mkdocs mkdocs-material pymdown-extensions
script:
- cp README.md mkdocs/docs/index.md
- cd mkdocs
- mkdocs gh-deploy --force --clean
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ Here is a set of quick tutorials to get you started with the library:

- [Tutorial 1: Text Classification](https://kashgari.bmio.net/tutorial/text-classification/)
- [Tutorial 2: Text Labeling](https://kashgari.bmio.net/tutorial/text-labeling/)
- [Tutorial 3: Language Embedding](https://kashgari.bmio.net/embeddings/)
- [Tutorial 3: Text Scoring](https://kashgari.bmio.net/tutorial/text-scoring/)
- [Tutorial 4: Language Embedding](https://kashgari.bmio.net/embeddings/)

There are also articles and posts that illustrate how to use Kashgari:

Expand Down
1 change: 1 addition & 0 deletions kashgari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
custom_objects = keras_bert.get_custom_objects()
CLASSIFICATION = TaskType.CLASSIFICATION
LABELING = TaskType.LABELING
SCORING = TaskType.SCORING

from kashgari.version import __version__

Expand Down
6 changes: 4 additions & 2 deletions kashgari/embeddings/base_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from tensorflow import keras

import kashgari
from kashgari.processors import ClassificationProcessor, LabelingProcessor
from kashgari.processors import ClassificationProcessor, LabelingProcessor, ScoringProcessor
from kashgari.processors.base_processor import BaseProcessor

L = keras.layers
Expand Down Expand Up @@ -74,8 +74,10 @@ def __init__(self,
self.processor = ClassificationProcessor()
elif task == kashgari.LABELING:
self.processor = LabelingProcessor()
elif task == kashgari.SCORING:
self.processor = ScoringProcessor()
else:
raise ValueError()
raise ValueError('Need to set the processor param, value: {labeling, classification, scoring}')
else:
self.processor = processor

Expand Down
1 change: 1 addition & 0 deletions kashgari/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
class TaskType(object):
    """String constants naming the task kinds handled by the library."""

    # Identifier of the text classification task.
    CLASSIFICATION = 'classification'
    # Identifier of the sequence labeling task.
    LABELING = 'labeling'
    # Identifier of the text scoring task.
    SCORING = 'scoring'


class Config(object):
Expand Down
1 change: 1 addition & 0 deletions kashgari/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@

from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.processors.labeling_processor import LabelingProcessor
from kashgari.processors.scoring_processor import ScoringProcessor
100 changes: 100 additions & 0 deletions kashgari/processors/scoring_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# encoding: utf-8

# author: BrikerMan
# contact: [email protected]
# blog: https://eliyar.biz

# file: scoring_processor.py
# time: 11:10 上午

from typing import List, Optional

import numpy as np

import kashgari
from kashgari import utils
from kashgari.processors.base_processor import BaseProcessor


def is_numeric(obj):
    """Report whether ``obj`` exposes the basic arithmetic operator methods.

    The check is purely structural: ``obj`` counts as numeric when it has every
    one of ``__add__``, ``__sub__``, ``__mul__``, ``__truediv__`` and
    ``__pow__``.

    Args:
        obj: any object.

    Returns:
        ``True`` when all five attributes are present, ``False`` otherwise.
    """
    for dunder in ('__add__', '__sub__', '__mul__', '__truediv__', '__pow__'):
        if not hasattr(obj, dunder):
            return False
    return True


class ScoringProcessor(BaseProcessor):
    """Corpus pre-processor for scoring tasks.

    Labels are numeric scores: they are converted to a numpy array by
    ``process_y_dataset`` and otherwise passed through unchanged, so no
    label-to-index dictionary is built. Token handling relies on
    ``token2idx``, ``token_unk``, ``token_bos``, ``token_eos`` and
    ``add_bos_eos``, which this class does not define itself (presumably
    provided by ``BaseProcessor`` - confirm there).
    """

    def __init__(self, output_dim=None, **kwargs):
        """Create the processor.

        Args:
            output_dim: number of score values per sample. When ``None`` it is
                inferred from the first label in ``_build_label_dict``.
            **kwargs: forwarded unchanged to ``BaseProcessor.__init__``.
        """
        super(ScoringProcessor, self).__init__(**kwargs)
        self.output_dim = output_dim

    def info(self):
        """Return the parent's info dict with ``'task'`` set to ``kashgari.SCORING``."""
        info = super(ScoringProcessor, self).info()
        info['task'] = kashgari.SCORING
        return info

    def _build_label_dict(self,
                          label_list: List):
        """Infer ``output_dim`` from the first label when it was not given.

        Nothing is done if ``output_dim`` is already set.

        Args:
            label_list: corpus labels; each label is a number, a list/tuple of
                numbers, or a 1D numpy array.

        Raises:
            ValueError: if ``label_list`` is empty, or the first label is not
                one of the supported kinds.
        """
        if self.output_dim is not None:
            return

        # len() rather than truthiness: label_list may be a numpy array, whose
        # truth value is ambiguous.
        if len(label_list) == 0:
            raise ValueError('Scoring label list is empty, can not infer output_dim')

        label_sample = label_list[0]
        if isinstance(label_sample, np.ndarray):
            # ndarrays expose the arithmetic dunders, so they must be handled
            # before the is_numeric() check; otherwise a 2D+ sample would be
            # silently treated as a single scalar score.
            if label_sample.ndim == 1:
                self.output_dim = label_sample.shape[0]
            elif label_sample.ndim == 0:
                self.output_dim = 1
            else:
                raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')
        elif isinstance(label_sample, (list, tuple)):
            self.output_dim = len(label_sample)
        elif is_numeric(label_sample):
            self.output_dim = 1
        else:
            raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')

    def process_y_dataset(self,
                          data: List,
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        """Convert labels to a numpy array.

        Args:
            data: score labels.
            max_len: accepted for signature compatibility; not used here.
            subset: optional indices; when given, only those entries of
                ``data`` (selected via ``utils.get_list_subset``) are converted.

        Returns:
            A new numpy array built from the selected labels.
        """
        if subset is not None:
            target = utils.get_list_subset(data, subset)
        else:
            target = data
        # np.array copies its input, so no defensive copy of ``data`` is needed.
        return np.array(target)

    def numerize_token_sequences(self,
                                 sequences: List[List[str]]):
        """Map each token sequence to a list of vocabulary indices.

        When ``self.add_bos_eos`` is truthy every sequence is wrapped with
        ``self.token_bos`` / ``self.token_eos`` first. Tokens missing from
        ``self.token2idx`` map to the index of ``self.token_unk``.

        Args:
            sequences: tokenized sentences.

        Returns:
            One list of indices per input sequence.
        """
        result = []
        for seq in sequences:
            if self.add_bos_eos:
                seq = [self.token_bos] + seq + [self.token_eos]
            unk_index = self.token2idx[self.token_unk]
            result.append([self.token2idx.get(token, unk_index) for token in seq])
        return result

    def numerize_label_sequences(self,
                                 sequences):
        """Return ``sequences`` unchanged; scores need no index mapping."""
        return sequences

    def reverse_numerize_label_sequences(self,
                                         sequences,
                                         lengths=None):
        """Return ``sequences`` unchanged; ``lengths`` is ignored."""
        return sequences


if __name__ == "__main__":
    # Manual smoke run: analyze three corpus samples against hand-written
    # scores and print the processed label array.
    from kashgari.corpus import SMP2018ECDTCorpus

    # The corpus labels are discarded; the demo uses fixed scores instead.
    corpus_x, _ = SMP2018ECDTCorpus.load_data()
    samples = corpus_x[:3]
    scores = [0.2, 0.3, 0.2]

    processor = ScoringProcessor()
    processor.analyze_corpus(samples, scores)
    print(processor.process_y_dataset(scores))
12 changes: 8 additions & 4 deletions kashgari/tasks/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,16 @@ def predict(self,
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1),
if self.task == 'scoring':
t_pred = pred
else:
t_pred = pred.argmax(-1)
res = self.embedding.reverse_numerize_label_sequences(t_pred,
lengths)
if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
print('input: {}'.format(tensor))
print('output: {}'.format(pred))
print('output argmax: {}'.format(t_pred))
return res

def evaluate(self,
Expand Down
14 changes: 14 additions & 0 deletions kashgari/tasks/scoring/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# encoding: utf-8

# author: BrikerMan
# contact: [email protected]
# blog: https://eliyar.biz

# file: __init__.py
# time: 11:36 上午


from kashgari.tasks.scoring.models import BiLSTM_Model

if __name__ == "__main__":
pass
92 changes: 92 additions & 0 deletions kashgari/tasks/scoring/base_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# encoding: utf-8

# author: BrikerMan
# contact: [email protected]
# blog: https://eliyar.biz

# file: base_model.py
# time: 11:36 上午


from typing import Callable
from typing import Dict, Any

import numpy as np
from sklearn import metrics

from kashgari.tasks.base_model import BaseModel


class BaseScoringModel(BaseModel):
    """Base class for scoring models."""

    __task__ = 'scoring'

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        """Return the default hyper-parameters; concrete models must override this."""
        raise NotImplementedError

    def compile_model(self, **kwargs):
        """Compile the model, filling in defaults for unset options.

        ``loss`` defaults to ``'mse'``, ``optimizer`` to ``'rmsprop'`` and
        ``metrics`` to ``['mae']`` whenever the corresponding keyword is
        missing or ``None``. All keywords are then forwarded to the parent
        ``compile_model``.
        """
        defaults = (
            ('loss', 'mse'),
            ('optimizer', 'rmsprop'),
            ('metrics', ['mae']),
        )
        for key, value in defaults:
            if kwargs.get(key) is None:
                kwargs[key] = value
        super(BaseScoringModel, self).compile_model(**kwargs)

    def evaluate(self,
                 x_data,
                 y_data,
                 batch_size=None,
                 should_round: bool = False,
                 round_func: Callable = None,
                 digits=4,
                 debug_info=False) -> Dict:
        """Predict on ``x_data``, compare with ``y_data`` and print a report.

        Args:
            x_data: input samples, passed to ``self.predict``.
            y_data: ground-truth scores.
            batch_size: forwarded to ``self.predict``.
            should_round: when true, predictions are rounded with
                ``round_func`` and scored with
                ``sklearn.metrics.classification_report``; otherwise mean
                squared error and R2 are reported.
            round_func: callable applied to each prediction when
                ``should_round`` is true; defaults to ``np.round``.
            digits: forwarded to ``classification_report``.
            debug_info: forwarded to ``self.predict``.

        Returns:
            With ``should_round``: the dict form of the classification report.
            Otherwise: a dict with keys ``'mean_squared_error'`` and
            ``'r2_score'``.

        Raises:
            ValueError: if ``should_round`` is true and the processor's
                ``output_dim`` is not 1.
        """
        # Validate before predicting so an unsupported configuration fails
        # without paying for a full prediction pass.
        if should_round and self.processor.output_dim != 1:
            raise ValueError('Evaluate with round function only accept 1D output')

        y_pred = self.predict(x_data, batch_size=batch_size, debug_info=debug_info)

        if should_round:
            if round_func is None:
                round_func = np.round
            y_pred = [round_func(i) for i in y_pred]
            # The text and dict forms need separate calls: sklearn returns one
            # or the other depending on ``output_dict``.
            report = metrics.classification_report(y_data,
                                                   y_pred,
                                                   digits=digits)
            report_dic = metrics.classification_report(y_data,
                                                       y_pred,
                                                       output_dict=True,
                                                       digits=digits)
            print(report)
        else:
            mean_squared_error = metrics.mean_squared_error(y_data, y_pred)
            r2_score = metrics.r2_score(y_data, y_pred)
            report_dic = {
                'mean_squared_error': mean_squared_error,
                'r2_score': r2_score
            }
            print(f"mean_squared_error : {mean_squared_error}\n"
                  f"r2_score : {r2_score}")
        return report_dic


if __name__ == "__main__":
pass
57 changes: 57 additions & 0 deletions kashgari/tasks/scoring/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# encoding: utf-8

# author: BrikerMan
# contact: [email protected]
# blog: https://eliyar.biz

# file: models.py
# time: 11:38 上午


import logging
from typing import Dict, Any

from tensorflow import keras

from kashgari.tasks.scoring.base_model import BaseScoringModel
from kashgari.layers import L


class BiLSTM_Model(BaseScoringModel):
    """Scoring model: embedding output -> bidirectional LSTM -> dense layer."""

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        """Return the default keyword arguments for each layer, keyed by layer name."""
        bi_lstm_defaults = {
            'units': 128,
            'return_sequences': False
        }
        dense_defaults = {
            'activation': 'linear'
        }
        return {
            'layer_bi_lstm': bi_lstm_defaults,
            'layer_dense': dense_defaults
        }

    def build_model_arc(self):
        """Build the Keras model and store it on ``self.tf_model``.

        The dense layer has ``self.processor.output_dim`` units; both layers
        take their remaining arguments from ``self.hyper_parameters``.
        """
        n_outputs = self.processor.output_dim
        params = self.hyper_parameters
        embed_model = self.embedding.embed_model

        recurrent = L.Bidirectional(L.LSTM(**params['layer_bi_lstm']))
        head = L.Dense(n_outputs, **params['layer_dense'])

        encoded = recurrent(embed_model.output)
        scores = head(encoded)

        self.tf_model = keras.Model(embed_model.inputs, scores)


if __name__ == "__main__":
    # Manual smoke run: fit the model on the validation split against random
    # 4-dimensional scores, then print predictions for the first ten samples.
    from kashgari.corpus import SMP2018ECDTCorpus
    import numpy as np

    # The corpus labels are discarded in favour of random scores.
    valid_x, _ = SMP2018ECDTCorpus.load_data('valid')
    random_scores = np.random.random((len(valid_x), 4))

    scorer = BiLSTM_Model()
    scorer.fit(valid_x, random_scores)
    print(scorer.predict(valid_x[:10]))

Loading