diff --git a/.cloudbuild/Dockerfile b/.cloudbuild/Dockerfile deleted file mode 100644 index 456a354c84..0000000000 --- a/.cloudbuild/Dockerfile +++ /dev/null @@ -1,4 +0,0 @@ -ARG IMAGE_NAME -FROM $IMAGE_NAME -COPY . /kerasnlp -WORKDIR /kerasnlp diff --git a/.cloudbuild/README.md b/.cloudbuild/README.md deleted file mode 100644 index 064caf5f33..0000000000 --- a/.cloudbuild/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# KerasNLP Accelerators Testing - -This `cloudbuild/` directory contains configurations for accelerators (GPU/TPU) -testing. Briefly, for each PR, it copies the PR's code to a base docker image -which contains KerasNLP dependencies to make a new docker image, and deploys the -new image to Google Kubernetes Engine cluster, then run all tests in -`keras_nlp/` via Google Cloud Build. - -- `cloudbuild.yaml`: The cloud build configuration that specifies steps to run - by cloud build. -- `Dockerfile`: The configuration to build the docker image for deployment. -- `requirements.txt`: Dependencies of KerasNLP. -- `unit_test_jobs.jsonnet`: Jsonnet config that tells GKE cluster to run all - unit tests in `keras_nlp/`. - -This test is powered by [ml-testing-accelerators](https://github.com/GoogleCloudPlatform/ml-testing-accelerators). - -### Adding Test Dependencies - -You must be authorized to run builds in the `keras-team-test` GCP project. -If you are not, please open a GitHub issue and ping a team member. -To authorize yourself with `keras-team-test`, run: - -```bash -gcloud config set project keras-team-test -``` - -To add/update dependency for GPU tests for a given framework: -- Add/update dependencies in `requirements.txt`. -- Add/update dependencies in `.cloudbuild/{framework}/Dockerfile`. -- Run the following: -``` -gcloud builds submit --region=us-west1 --tag us-west1-docker.pkg.dev/keras-team-test/keras-nlp-test/keras-nlp-image-{framework}:deps --timeout=30m -``` - -Alternately, to update all docker images at once, just run: -``` -./cloudbuild/update_images.sh -``` - -### Run TPU Testing - -Because of the TPU capacity limit, we cannot set automatic TPU testing. To -trigger the TPU testing, run the following command: - -``` -gcloud builds submit --config .cloudbuild/tpu_cloudbuild.yaml . --project=keras-team-test -``` diff --git a/.cloudbuild/cloudbuild.yaml b/.cloudbuild/cloudbuild.yaml deleted file mode 100644 index 474cf0e32a..0000000000 --- a/.cloudbuild/cloudbuild.yaml +++ /dev/null @@ -1,77 +0,0 @@ -substitutions: - # GCS bucket name. - _GCS_BUCKET: 'gs://keras-nlp-github-test' - # GKE cluster name. - _CLUSTER_NAME: 'keras-nlp-test-cluster' - # Location of GKE cluster. - _CLUSTER_ZONE: 'us-west1-b' - # Image name. 
- _IMAGE_NAME: 'us-west1-docker.pkg.dev/keras-team-test/keras-nlp-test/keras-nlp-image-${_BACKEND}' -steps: -- name: 'gcr.io/cloud-builders/docker' - id: build-image - entrypoint: 'bash' - args: - ['-c', 'docker build -f .cloudbuild/Dockerfile -t $_IMAGE_NAME:$BUILD_ID --build-arg IMAGE_NAME=$_IMAGE_NAME:deps .'] -- name: 'gcr.io/cloud-builders/docker' - id: push-image - waitFor: - - build-image - args: ['push', '$_IMAGE_NAME:$BUILD_ID'] -- name: 'golang' - id: download-jsonnet - waitFor: ['-'] - entrypoint: 'go' - args: [ - 'install', - 'github.com/google/go-jsonnet/cmd/jsonnet@latest', - ] -- name: 'gcr.io/cloud-builders/gcloud' - id: clone-templates - waitFor: ['-'] - entrypoint: 'git' - args: [ - 'clone', - 'https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git', - ] -- name: 'golang' - id: build-templates - waitFor: - - download-jsonnet - - clone-templates - entrypoint: 'jsonnet' - args: [ - '.cloudbuild/unit_test_jobs.jsonnet', - '--string', - '-J', 'ml-testing-accelerators', - '--ext-str', 'image=$_IMAGE_NAME', - '--ext-str', 'tag_name=$BUILD_ID', - '--ext-str', 'gcs_bucket=$_GCS_BUCKET', - '--ext-str', 'backend=$_BACKEND', - '-o', 'output.yaml', - ] -- name: 'gcr.io/cloud-builders/gcloud' - id: create-job - waitFor: - - push-image - - build-templates - entrypoint: bash - args: - - -c - - | - set -u - set -e - set -x - gcloud container clusters get-credentials $_CLUSTER_NAME --zone $_CLUSTER_ZONE --project keras-team-test - job_name=$(kubectl create -f output.yaml -o name) - sleep 5 - pod_name=$(kubectl wait --for condition=ready --timeout=120m pod -l job-name=${job_name#job.batch/} -o name) - kubectl logs -f $pod_name --container=train - sleep 5 - gcloud artifacts docker images delete $_IMAGE_NAME:$BUILD_ID - exit $(kubectl get $pod_name -o jsonpath={.status.containerStatuses[0].state.terminated.exitCode}) -timeout: 120m -options: - volumes: - - name: go-modules - path: /go diff --git a/.cloudbuild/cloudbuild_tpu.yaml b/.cloudbuild/cloudbuild_tpu.yaml deleted file mode 100644 index c715d71fb7..0000000000 --- a/.cloudbuild/cloudbuild_tpu.yaml +++ /dev/null @@ -1,79 +0,0 @@ -substitutions: - # GCS bucket name. - _GCS_BUCKET: 'gs://keras-nlp-github-test' - # GKE cluster name. - _CLUSTER_NAME: 'keras-nlp-tpu-test-cluster' - # Location of GKE cluster. - _CLUSTER_ZONE: 'us-central1-a' - # Image name. 
- _IMAGE_NAME: 'us-west1-docker.pkg.dev/keras-team-test/keras-nlp-test/keras-nlp-image' -steps: -- name: 'docker' - id: build-image - args: [ - 'build', - '.', - '-f', '.cloudbuild/Dockerfile', - '-t', '$_IMAGE_NAME:$BUILD_ID', - ] -- name: 'docker' - id: push-image - waitFor: - - build-image - args: ['push', '$_IMAGE_NAME:$BUILD_ID'] -- name: 'golang' - id: download-jsonnet - waitFor: ['-'] - entrypoint: 'go' - args: [ - 'install', - 'github.com/google/go-jsonnet/cmd/jsonnet@latest', - ] -- name: 'gcr.io/cloud-builders/gcloud' - id: clone-templates - waitFor: ['-'] - entrypoint: 'git' - args: [ - 'clone', - 'https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git', - ] -- name: 'golang' - id: build-templates - waitFor: - - download-jsonnet - - clone-templates - entrypoint: 'jsonnet' - args: [ - '.cloudbuild/unit_test_jobs_tpu.jsonnet', - '--string', - '-J', 'ml-testing-accelerators', - '--ext-str', 'image=$_IMAGE_NAME', - '--ext-str', 'tag_name=$BUILD_ID', - '--ext-str', 'gcs_bucket=$_GCS_BUCKET', - '-o', 'output.yaml', - ] -- name: 'gcr.io/cloud-builders/gcloud' - id: create-job - waitFor: - - push-image - - build-templates - entrypoint: bash - args: - - -c - - | - set -u - set -e - set -x - gcloud container clusters get-credentials $_CLUSTER_NAME --zone $_CLUSTER_ZONE --project keras-team-test - job_name=$(kubectl create -f output.yaml -o name) - sleep 5 - pod_name=$(kubectl wait --for condition=ready --timeout=120m pod -l job-name=${job_name#job.batch/} -o name) - kubectl logs -f $pod_name --container=train - sleep 5 - # gcloud artifacts docker images delete $_IMAGE_NAME:$BUILD_ID - exit $(kubectl get $pod_name -o jsonpath={.status.containerStatuses[0].state.terminated.exitCode}) -timeout: 120m -options: - volumes: - - name: go-modules - path: /go diff --git a/.cloudbuild/jax/Dockerfile b/.cloudbuild/jax/Dockerfile deleted file mode 100644 index ec84817cc2..0000000000 --- a/.cloudbuild/jax/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM nvidia/cuda:11.7.1-base-ubuntu20.04 -RUN apt-get update -RUN apt-get install -y python3 python3-pip -RUN apt-get install -y git -RUN git clone https://github.com/keras-team/keras-nlp.git -RUN cd keras-nlp -RUN pip install -r keras-nlp/requirements.txt -RUN pip install --upgrade "jax[cuda11_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html diff --git a/.cloudbuild/tensorflow/Dockerfile b/.cloudbuild/tensorflow/Dockerfile deleted file mode 100644 index d452d3761b..0000000000 --- a/.cloudbuild/tensorflow/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM tensorflow/tensorflow:2.13.0-gpu -RUN apt-get -y update -RUN apt-get -y install git -RUN git clone https://github.com/keras-team/keras-nlp.git -RUN cd keras-nlp -RUN pip install -r keras-nlp/requirements.txt diff --git a/.cloudbuild/torch/Dockerfile b/.cloudbuild/torch/Dockerfile deleted file mode 100644 index ecd88b81a3..0000000000 --- a/.cloudbuild/torch/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM nvidia/cuda:11.7.1-base-ubuntu20.04 -RUN apt-get update -RUN apt-get install -y python3 python3-pip -RUN apt-get install -y git -RUN git clone https://github.com/keras-team/keras-nlp.git -RUN cd keras-nlp -RUN pip install -r keras-nlp/requirements.txt -RUN pip install torch diff --git a/.cloudbuild/unit_test_jobs.jsonnet b/.cloudbuild/unit_test_jobs.jsonnet deleted file mode 100644 index 560581c2ef..0000000000 --- a/.cloudbuild/unit_test_jobs.jsonnet +++ /dev/null @@ -1,43 +0,0 @@ -local base = import 
'templates/base.libsonnet'; -local gpus = import 'templates/gpus.libsonnet'; - -local image = std.extVar('image'); -local tagName = std.extVar('tag_name'); -local gcsBucket = std.extVar('gcs_bucket'); -local backend = std.extVar('backend'); - -local unittest = base.BaseTest { - // Configure job name. - frameworkPrefix: backend, - modelName: "keras-nlp", - mode: "unit-tests", - timeout: 7200, # 2 hours, in seconds - - // Set up runtime environment. - image: image, - imageTag: tagName, - accelerator: gpus.teslaT4, - outputBucket: gcsBucket, - - entrypoint: [ - 'bash', - '-c', - std.format( - ||| - export KERAS_BACKEND=%s - - # Run whatever is in `command` here. - cd keras-nlp - ${@:0} - |||, - backend - ) - ], - command: [ - 'pytest', - 'keras_nlp', - '--run_large', - ], -}; - -std.manifestYamlDoc(unittest.oneshotJob, quote_keys=false) diff --git a/.cloudbuild/unit_test_jobs_tpu.jsonnet b/.cloudbuild/unit_test_jobs_tpu.jsonnet deleted file mode 100644 index e429da40cc..0000000000 --- a/.cloudbuild/unit_test_jobs_tpu.jsonnet +++ /dev/null @@ -1,42 +0,0 @@ -local base = import 'templates/base.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -local image = std.extVar('image'); -local tagName = std.extVar('tag_name'); -local gcsBucket = std.extVar('gcs_bucket'); - -local unittest = base.BaseTest { - // Configure job name. - frameworkPrefix: "tf", - modelName: "keras-nlp", - mode: "unit-tests", - timeout: 7200, # 2 hours, in seconds - - // Set up runtime environment. - image: image, - imageTag: tagName, - accelerator: tpus.v3_8, - outputBucket: gcsBucket, - tpuSettings+: { - softwareVersion: '2.10.0', - }, - - entrypoint: [ - 'bash', - '-c', - ||| - # Run whatever is in `command` here. - cd keras-nlp - ${@:0} - ||| - ], - command: [ - 'pytest', - '-m', - 'tpu', - 'keras_nlp', - '--run_tpu', - ], -}; - -std.manifestYamlDoc(unittest.oneshotJob, quote_keys=false) diff --git a/.cloudbuild/update_images.sh b/.cloudbuild/update_images.sh deleted file mode 100755 index 2876df81e8..0000000000 --- a/.cloudbuild/update_images.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -ex - -base_dir=$(dirname $0) - -for platform in "jax" "tensorflow" "torch"; do - pushd "${base_dir}/${platform}" > /dev/null - gcloud builds submit \ - --region=us-west1 \ - --project=keras-team-test \ - --tag "us-west1-docker.pkg.dev/keras-team-test/keras-nlp-test/keras-nlp-image-${platform}:deps" \ - --timeout=30m - popd -done diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 926d6795a0..64a41ca16e 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -3,14 +3,19 @@ name: Tests on: push: pull_request: + workflow_call: release: types: [created] + +permissions: + contents: read + jobs: - build: - name: Test the code with tf.keras + keras_2: + name: Test the code with Keras 2 runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python 3.9 uses: actions/setup-python@v1 with: @@ -29,8 +34,8 @@ jobs: ${{ runner.os }}-pip- - name: Install dependencies run: | - pip install -r requirements.txt --progress-bar off - pip install jax[cpu] --progress-bar off + pip install -r requirements-common.txt --progress-bar off + pip install tensorflow-text==2.14 tensorflow==2.14 keras-core pip install --no-deps -e "." --progress-bar off - name: Test with pytest run: | @@ -38,15 +43,15 @@ jobs: - name: Run integration tests run: | python pip_build.py --install && cd integration_tests && pytest . 
- multibackend: - name: Test the code with Keras Core + keras_3: + name: Test the code with Keras 3 strategy: fail-fast: false matrix: backend: [tensorflow, jax, torch] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python 3.9 uses: actions/setup-python@v1 with: @@ -66,19 +71,17 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt --progress-bar off - pip install torch>=2.0.1+cpu --progress-bar off - pip install jax[cpu] --progress-bar off pip install --no-deps -e "." --progress-bar off - name: Test with pytest env: KERAS_BACKEND: ${{ matrix.backend }} run: | pytest keras_nlp/ - format: + check_format: name: Check the code format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python 3.9 uses: actions/setup-python@v1 with: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 0000000000..677a641658 --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,49 @@ +name: Nightly + +on: + workflow_dispatch: # To Generate wheels on demand outside of schedule. + schedule: + - cron: '0 3 * * *' # run at 3 AM UTC / 8 PM PDT + +permissions: + contents: read + +jobs: + run-test-for-nightly: + uses: ./.github/workflows/actions.yml + nightly: + name: Build Wheel file and upload + needs: [run-test-for-nightly] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Get pip cache dir + id: pip-cache + run: | + python -m pip install --upgrade pip setuptools + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} + restore-keys: | + ${{ runner.os }}-pip- + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools + pip install twine + pip install -r requirements.txt --progress-bar off + - name: Build wheel file + run: | + python pip_build.py --nightly + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_NIGHTLY_API_TOKEN }} + packages-dir: dist/ + verbose: true diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index a1774c9057..c3f6767350 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -1,20 +1,40 @@ name: Publish to PyPI on: push + +permissions: + contents: read + jobs: build-and-publish: name: Build and publish to PyPI runs-on: ubuntu-latest steps: - - uses: actions/checkout@master - - name: Install dependencies - run: | - pip install -r requirements.txt --progress-bar off - - name: Build a binary wheel and a source tarball - run: >- - python pip_build.py - - name: Publish distribution to PyPI - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{ secrets.PYPI_API_TOKEN }} + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Get pip cache dir + id: pip-cache + run: | + python -m pip install --upgrade pip setuptools + echo "::set-output name=dir::$(pip cache dir)" + - name: pip cache + uses: actions/cache@v2 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} + restore-keys: | + ${{ runner.os }}-pip- + - name: Install 
dependencies + run: | + pip install -r requirements.txt --progress-bar off + - name: Build a binary wheel and a source tarball + run: >- + python pip_build.py + - name: Publish distribution to PyPI + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.kokoro/README.md b/.kokoro/README.md new file mode 100644 index 0000000000..b1fae5ee5a --- /dev/null +++ b/.kokoro/README.md @@ -0,0 +1 @@ +CI to run on PR and merge to master, and for continuous builds. \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/build.sh b/.kokoro/github/ubuntu/gpu/build.sh new file mode 100644 index 0000000000..2017b77c82 --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/build.sh @@ -0,0 +1,56 @@ +set -e +set -x + +cd "${KOKORO_ROOT}/" + +sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 + +PYTHON_BINARY="/usr/bin/python3.9" + +"${PYTHON_BINARY}" -m venv venv +source venv/bin/activate +# Check the python version +python --version +python3 --version + +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:" +# Check cuda +nvidia-smi +nvcc --version + +cd "src/github/keras-nlp" +pip install -U pip setuptools psutil + +if [ "${KERAS2:-0}" == "1" ] +then + echo "Keras2 detected." + pip install -r requirements-common.txt --progress-bar off + pip install tensorflow-text==2.15 tensorflow[and-cuda]~=2.15 keras-core + +elif [ "$KERAS_BACKEND" == "tensorflow" ] +then + echo "TensorFlow backend detected." + pip install -r requirements-tensorflow-cuda.txt --progress-bar off + +elif [ "$KERAS_BACKEND" == "jax" ] +then + echo "JAX backend detected." + pip install -r requirements-jax-cuda.txt --progress-bar off + +elif [ "$KERAS_BACKEND" == "torch" ] +then + echo "PyTorch backend detected." + pip install -r requirements-torch-cuda.txt --progress-bar off +fi + +pip install --no-deps -e "."
--progress-bar off + +# Run Extra Large Tests for Continuous builds +if [ "${RUN_XLARGE:-0}" == "1" ] +then + pytest keras_nlp --check_gpu --run_large --run_extra_large \ + --cov=keras-nlp +else + pytest keras_nlp --check_gpu --run_large \ + --cov=keras-nlp +fi \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/jax/continuous.cfg b/.kokoro/github/ubuntu/gpu/jax/continuous.cfg new file mode 100644 index 0000000000..1b9ffb605a --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/jax/continuous.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS_BACKEND" + value: "jax" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/jax/presubmit.cfg b/.kokoro/github/ubuntu/gpu/jax/presubmit.cfg new file mode 100644 index 0000000000..1b9ffb605a --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/jax/presubmit.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS_BACKEND" + value: "jax" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/keras2/continuous.cfg b/.kokoro/github/ubuntu/gpu/keras2/continuous.cfg new file mode 100644 index 0000000000..7e971ac96d --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/keras2/continuous.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS2" + value: "1" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/keras2/presubmit.cfg b/.kokoro/github/ubuntu/gpu/keras2/presubmit.cfg new file mode 100644 index 0000000000..7e971ac96d --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/keras2/presubmit.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS2" + value: "1" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/tensorflow/continuous.cfg b/.kokoro/github/ubuntu/gpu/tensorflow/continuous.cfg new file mode 100644 index 0000000000..b85ee6f4eb --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/tensorflow/continuous.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS_BACKEND" + value: "tensorflow" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/tensorflow/presubmit.cfg b/.kokoro/github/ubuntu/gpu/tensorflow/presubmit.cfg new file mode 100644 index 0000000000..b85ee6f4eb --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/tensorflow/presubmit.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS_BACKEND" + value: "tensorflow" +} + +# Set timeout to 60 mins 
from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/torch/continuous.cfg b/.kokoro/github/ubuntu/gpu/torch/continuous.cfg new file mode 100644 index 0000000000..5d25106b3f --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/torch/continuous.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS_BACKEND" + value: "torch" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/.kokoro/github/ubuntu/gpu/torch/presubmit.cfg b/.kokoro/github/ubuntu/gpu/torch/presubmit.cfg new file mode 100644 index 0000000000..5d25106b3f --- /dev/null +++ b/.kokoro/github/ubuntu/gpu/torch/presubmit.cfg @@ -0,0 +1,16 @@ +build_file: "keras-nlp/.kokoro/github/ubuntu/gpu/build.sh" + +action { + define_artifacts { + regex: "**/sponge_log.log" + regex: "**/sponge_log.xml" + } +} + +env_vars: { + key: "KERAS_BACKEND" + value: "torch" +} + +# Set timeout to 60 mins from default 180 mins +timeout_mins: 60 \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d5af724c00..394a1fb148 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,7 +81,7 @@ Once the pull request is approved, a team member will take care of merging. ## Setting up an Environment -Python 3.7 or later is required. +Python 3.9 or later is required. Setting up your KerasNLP development environment requires you to fork the KerasNLP repository and clone it locally. With the @@ -93,72 +93,57 @@ cd keras-nlp ``` Next we must setup a python environment with the correct dependencies. We -recommend using `conda` to install tensorflow dependencies (such as CUDA), and -`pip` to install python packages from PyPI. The exact method will depend on your -OS. +recommend using `conda` to set up a base environment, and `pip` to install +python packages from PyPI. The exact method will depend on your OS. -**Note**: Please be careful not to use the `tensorflow` pre-packaged with conda, -which is incompatible with `tensorflow-text` on PyPi, and follow the -instructions below. +**Note**: Be careful not to mix pre-packaged TensorFlow and JAX libraries in +`conda` with PyPI packages from `pip`. We recommend pulling *all* KerasNLP +dependencies via `pip` as described below. ### Linux (recommended) -To setup a complete environment with TensorFlow, a local install of keras-nlp, -and all development tools, run the following or adapt it to suit your needs. +For developing and unit testing the library, a CPU-only environment is often +sufficient. For any training or inference with the library, you will quickly +want accelerator support. The easiest way to get GPU support across all of our +backends is to set up a few different python environments and pull in all CUDA +dependencies via `pip`. + +The shell snippet below will install four conda environments: `keras-nlp-cpu`, +`keras-nlp-jax`, `keras-nlp-torch`, and `keras-nlp-tensorflow`. The CPU +environment supports all backends without CUDA, and each backend environment +has CUDA support. ```shell -# Create and activate conda environment. -conda create -n keras-nlp python=3.9 -conda activate keras-nlp - -# The following can be omitted if GPU support is not required.
-conda install -c conda-forge cudatoolkit-dev=11.2 cudnn=8.1.0 -mkdir -p $CONDA_PREFIX/etc/conda/activate.d/ -echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh -echo 'export XLA_FLAGS=--xla_gpu_cuda_data_dir=$CONDA_PREFIX/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh -source $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - -# Install dependencies. -python -m pip install --upgrade pip -python -m pip install -r requirements.txt -python -m pip install -e "." +conda create -y -n keras-nlp-cpu python=3.10 +conda activate keras-nlp-cpu +pip install -r requirements.txt  # install deps +python pip_build.py --install  # install keras-nlp + +for backend in "jax" "torch" "tensorflow"; do + conda create -y -n keras-nlp-${backend} python=3.10 + conda activate keras-nlp-${backend} + pip install -r requirements-${backend}-cuda.txt  # install deps + python pip_build.py --install  # install keras-nlp +done ``` -### MacOS - -⚠️⚠️⚠️ MacOS binaries are for the M1 architecture are not currently available from -official sources. You can try experimental development workflow leveraging the -[tensorflow metal plugin](https://developer.apple.com/metal/tensorflow-plugin/) -and a [community maintained build](https://github.com/sun1638650145/Libraries-and-Extensions-for-TensorFlow-for-Apple-Silicon) -of `tensorflow-text`. These binaries are not provided by Google, so proceed at -your own risk. - -#### Experimental instructions for Arm (M1) +To activate the JAX environment and set Keras to use the JAX backend, run: ```shell -# Create and activate conda environment. -conda create -n keras-nlp python=3.9 -conda activate keras-nlp - -# Install dependencies. -conda install -c apple tensorflow-deps=2.9 -python -m pip install --upgrade pip -python -m pip install -r requirements-macos-m1.txt -python -m pip install -e "." +conda activate keras-nlp-jax && export KERAS_BACKEND=jax ``` -#### Instructions for x86 (Intel) +### MacOS -```shell -# Create and activate conda environment. -conda create -n keras-nlp python=3.9 -conda activate keras-nlp - -# Install dependencies. -python -m pip install --upgrade pip -python -m pip install -r requirements.txt -python -m pip install -e "." -``` +`tensorflow-text` does not release precompiled binaries for MacOS M-series +chips, though the library does support building from source on MacOS. + +We strongly recommend a Linux development environment for an easy contribution +experience. To build a dev environment from scratch on MacOS, see the following +guides: + +- https://developer.apple.com/metal/tensorflow-plugin/ +- https://github.com/tensorflow/text ### Windows diff --git a/CONTRIBUTING_MODELS.md b/CONTRIBUTING_MODELS.md index de5e71af7a..40028aac15 100644 --- a/CONTRIBUTING_MODELS.md +++ b/CONTRIBUTING_MODELS.md @@ -35,7 +35,6 @@ Keep this checklist handy! ### Step 4: PR #3 - Add XX Presets - [ ] An `xx/xx_presets.py` file with links to weights uploaded to a personal GCP bucket/Google Drive \[[Example](https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/models/distil_bert/distil_bert_presets.py)\]. -- [ ] An `xx/xx_presets_test.py` file with runnable tests for each preset \[[Example](https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/models/distil_bert/distil_bert_presets_test.py)\].
- [ ] A `tools/checkpoint_conversion/convert_xx_checkpoints.py` which is reusable script for converting checkpoints \[[Example](https://github.com/keras-team/keras-nlp/blob/master/tools/checkpoint_conversion/convert_distilbert_checkpoints.py)\]. - [ ] A Colab notebook link in the PR description, showing an end-to-end task such as text classification, etc. The task model can be built using the backbone model, with the task head on top \[[Example](https://gist.github.com/mattdangerw/bf0ca07fb66b6738150c8b56ee5bab4e)\]. diff --git a/README.md b/README.md index abf7e38813..4d41a8685e 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,25 @@ # KerasNLP: Modular NLP Workflows for Keras [![](https://github.com/keras-team/keras-nlp/workflows/Tests/badge.svg?branch=master)](https://github.com/keras-team/keras-nlp/actions?query=workflow%3ATests+branch%3Amaster) -![Python](https://img.shields.io/badge/python-v3.8.0+-success.svg) -![Tensorflow](https://img.shields.io/badge/tensorflow-v2.5.0+-success.svg) +![Python](https://img.shields.io/badge/python-v3.9.0+-success.svg) [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/keras-team/keras-nlp/issues) -KerasNLP is a natural language processing library that works natively -with TensorFlow, JAX, or PyTorch. Built on [Keras Core](https://keras.io/keras_core/announcement/), -these models, layers, metrics, callbacks, etc., can be trained and serialized -in any framework and re-used in another without costly migrations. See "Using -KerasNLP with Keras Core" below for more details on multi-framework KerasNLP. - -KerasNLP supports users through their entire development cycle. Our workflows -are built from modular components that have state-of-the-art preset weights and -architectures when used out-of-the-box and are easily customizable when more -control is needed. - -This library is an extension of the core Keras API; all high-level modules are -[`Layers`](https://keras.io/api/layers/) or -[`Models`](https://keras.io/api/models/) that receive that same level of polish -as core Keras. If you are familiar with Keras, congratulations! You already -understand most of KerasNLP. +KerasNLP is a natural language processing library that works natively +with TensorFlow, JAX, or PyTorch. Built on Keras 3, these models, layers, +metrics, and tokenizers can be trained and serialized in any framework and +re-used in another without costly migrations. + +KerasNLP supports users through their entire development cycle. Our workflows +are built from modular components that have state-of-the-art preset weights when +used out-of-the-box and are easily customizable when more control is needed. -See our [Getting Started guide](https://keras.io/guides/keras_nlp/getting_started) -for example usage of our modular API starting with evaluating pretrained models -and building up to designing a novel transformer architecture and training a -tokenizer from scratch. +This library is an extension of the core Keras API; all high-level modules are +[`Layers`](https://keras.io/api/layers/) or +[`Models`](https://keras.io/api/models/) that receive that same level of polish +as core Keras. If you are familiar with Keras, congratulations! You already +understand most of KerasNLP. -We are a new and growing project and welcome [contributions](CONTRIBUTING.md). 
+See our [Getting Started guide](https://keras.io/guides/keras_nlp/getting_started) +to start learning our API. We welcome [contributions](CONTRIBUTING.md). ## Quick Links @@ -47,53 +40,52 @@ We are a new and growing project and welcome [contributions](CONTRIBUTING.md). ## Installation -To install the latest official release: +KerasNLP supports both Keras 2 and Keras 3. We recommend Keras 3 for all new +users, as it enables using KerasNLP models and layers with JAX, TensorFlow and +PyTorch. -``` -pip install keras-nlp --upgrade -``` +### Keras 2 Installation -To install the latest unreleased changes to the library, we recommend using -pip to install directly from the master branch on github: +To install the latest KerasNLP release with Keras 2, simply run: ``` -pip install git+https://github.com/keras-team/keras-nlp.git --upgrade +pip install --upgrade keras-nlp ``` -## Using KerasNLP with Keras Core -As of version `0.6.0`, KerasNLP supports multiple backends with Keras Core out -of the box. There are two ways to configure KerasNLP to run with multi-backend -support: +### Keras 3 Installation -1. Via the `KERAS_BACKEND` environment variable. If set, then KerasNLP will be -using Keras Core with the backend specified (e.g., `KERAS_BACKEND=jax`). -2. Via the `.keras/keras.json` and `.keras/keras_nlp.json` config files (which -are automatically created the first time you import KerasNLP): - - Set your backend of choice in `.keras/keras.json`; e.g., `"backend": "jax"`. - - Set `"multi_backend": True` in `.keras/keras_nlp.json`. +There are currently two ways to install Keras 3 with KerasNLP. To install the +stable versions of KerasNLP and Keras 3, you should install Keras 3 **after** +installing KerasNLP. This is a temporary step while TensorFlow is pinned to +Keras 2, and will no longer be necessary after TensorFlow 2.16. -Once that configuration step is done, you can just import KerasNLP and start -using it on top of your backend of choice: +``` +pip install --upgrade keras-nlp +pip install --upgrade keras>=3 +``` -```python -import keras_nlp +To install the latest nightly changes for both KerasNLP and Keras, you can use +our nightly package. -gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en") -gpt2_lm.generate("My trip to Yosemite was", max_length=200) ``` +pip install --upgrade keras-nlp-nightly +``` + +> [!IMPORTANT] +> Keras 3 will not function with TensorFlow 2.14 or earlier. -Until Keras Core is officially released as Keras 3.0, KerasNLP will use -`tf.keras` as the default backend. To restore this default behavior, simply -`unset KERAS_BACKEND` and ensure that `"multi_backend": False` or is unset in -`.keras/keras_nlp.json`. You will need to restart the Python runtime for changes -to take effect. +Read [Getting started with Keras](https://keras.io/getting_started/) for more information +on installing Keras 3 and compatibility with different frameworks. ## Quickstart -Fine-tune BERT on a small sentiment analysis task using the +Fine-tune BERT on a small sentiment analysis task using the [`keras_nlp.models`](https://keras.io/api/keras_nlp/models/) API: ```python +import os +os.environ["KERAS_BACKEND"] = "tensorflow" # Or "jax" or "torch"! + import keras_nlp import tensorflow_datasets as tfds @@ -107,6 +99,7 @@ imdb_train, imdb_test = tfds.load( classifier = keras_nlp.models.BertClassifier.from_preset( "bert_base_en_uncased", num_classes=2, + activation="softmax", ) # Fine-tune on IMDb movie reviews. 
classifier.fit(imdb_train, validation_data=imdb_test) @@ -116,6 +109,29 @@ classifier.predict(["What an amazing movie!", "A total waste of my time."]) For more in depth guides and examples, visit https://keras.io/keras_nlp/. +## Configuring your backend + +If you have Keras 3 installed in your environment (see installation above), +you can use KerasNLP with any of JAX, TensorFlow and PyTorch. To do so, set the +`KERAS_BACKEND` environment variable. For example: + +```shell +export KERAS_BACKEND=jax +``` + +Or in Colab, with: + +```python +import os +os.environ["KERAS_BACKEND"] = "jax" + +import keras_nlp +``` + +> [!IMPORTANT] +> Make sure to set the `KERAS_BACKEND` before importing any Keras libraries; it +> will be used to set up Keras when it is first imported. + ## Compatibility We follow [Semantic Versioning](https://semver.org/), and plan to diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..09b1bcfe83 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,18 @@ +# Security Policy + +If you have discovered a security vulnerability in this project, please report it +privately. **Do not disclose it as a public issue.** This gives us time to work with you +to fix the issue before public exposure, reducing the chance that the exploit will be +used before a patch is released. + +You may submit the report in the following ways: + +- send a [private vulnerability report](https://github.com/keras-team/keras-nlp/security/advisories/new) + +Please provide the following information in your report: + +- A description of the vulnerability and its impact +- How to reproduce the issue + +This project is maintained by volunteers on a reasonable-effort basis. As such, +please give us 90 days to work on a fix before public exposure. diff --git a/keras_nlp/__init__.py b/keras_nlp/__init__.py index de7ae1c180..30f8a53b16 100644 --- a/keras_nlp/__init__.py +++ b/keras_nlp/__init__.py @@ -26,6 +26,5 @@ from keras_nlp import samplers from keras_nlp import tokenizers from keras_nlp import utils - -# This is the global source of truth for the version number. -__version__ = "0.6.0.dev0" +from keras_nlp.version_utils import __version__ +from keras_nlp.version_utils import version diff --git a/keras_nlp/backend/__init__.py b/keras_nlp/backend/__init__.py index cf1c63c2a9..1ffbde75a5 100644 --- a/keras_nlp/backend/__init__.py +++ b/keras_nlp/backend/__init__.py @@ -14,14 +14,16 @@ """ Keras backend module. -This module adds a temporarily Keras API surface that is fully under KerasNLP -control. This allows us to switch between `keras_core` and `tf.keras`, as well -as add shims to support older version of `tf.keras`. +This module adds a temporary Keras API surface that is fully under KerasNLP +control. The goal is to allow us to write Keras 3-like code everywhere, while +still supporting Keras 2. We do this by using the `keras_core` package to +backport the Keras 3 numerics APIs (`keras.ops` and `keras.random`) into +Keras 2. The sub-modules exposed are as follows: -- `config`: check which backend is being run. -- `keras`: The full `keras` API (via `keras_core` or `tf.keras`). -- `ops`: `keras_core.ops`, always tf backed if using `tf.keras`. -- `random`: `keras_core.random`, always tf backed if using `tf.keras`. +- `config`: check which version of Keras is being run. +- `keras`: The full `keras` API with compat shims for older Keras versions. +- `ops`: `keras.ops` for Keras 3 or `keras_core.ops` for Keras 2. +- `random`: `keras.random` for Keras 3 or `keras_core.random` for Keras 2.
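+
+Example usage (a minimal sketch of the intended behavior; it assumes only the
+sub-modules listed above):
+
+    from keras_nlp.backend import config, ops, random
+
+    x = random.uniform(shape=(2, 4))  # backend-agnostic randomness
+    y = ops.softmax(x, axis=-1)       # Keras 3 numerics on either Keras version
+    print(config.backend())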
""" from keras_nlp.backend import config diff --git a/keras_nlp/backend/config.py b/keras_nlp/backend/config.py index 578e1746b7..be3fe23335 100644 --- a/keras_nlp/backend/config.py +++ b/keras_nlp/backend/config.py @@ -12,63 +12,52 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os -import keras_core -_MULTI_BACKEND = False +def detect_if_tensorflow_uses_keras_3(): + # We follow the version of keras that tensorflow is configured to use. + try: + from tensorflow import keras -# Set Keras base dir path given KERAS_HOME env variable, if applicable. -# Otherwise either ~/.keras or /tmp. -if "KERAS_HOME" in os.environ: - _keras_dir = os.environ.get("KERAS_HOME") -else: - _keras_base_dir = os.path.expanduser("~") - if not os.access(_keras_base_dir, os.W_OK): - _keras_base_dir = "/tmp" - _keras_dir = os.path.join(_keras_base_dir, ".keras") + # Note that only recent versions of keras have a `version()` function. + if hasattr(keras, "version") and keras.version().startswith("3."): + return True + except: + raise ValueError( + "Unable to import `keras` with `tensorflow`. Please check your " + "Keras and Tensorflow version are compatible; Keras 3 requires " + "TensorFlow 2.15 or later. See keras.io/getting_started for more " + "information on installing Keras." + ) -# Attempt to read KerasNLP config file. -_config_path = os.path.expanduser(os.path.join(_keras_dir, "keras_nlp.json")) -if os.path.exists(_config_path): - try: - with open(_config_path) as f: - _config = json.load(f) - except ValueError: - _config = {} - _MULTI_BACKEND = _config.get("multi_backend", _MULTI_BACKEND) + # No `keras.version()` means we are on an old version of keras. + return False -# Save config file, if possible. -if not os.path.exists(_keras_dir): - try: - os.makedirs(_keras_dir) - except OSError: - # Except permission denied and potential race conditions - # in multi-threaded environments. - pass -if not os.path.exists(_config_path): - _config = { - "multi_backend": _MULTI_BACKEND, - } - try: - with open(_config_path, "w") as f: - f.write(json.dumps(_config, indent=4)) - except IOError: - # Except permission denied. - pass +_USE_KERAS_3 = detect_if_tensorflow_uses_keras_3() -# Use keras-core if KERAS_BACKEND is set in the environment. -if "KERAS_BACKEND" in os.environ and os.environ["KERAS_BACKEND"]: - _MULTI_BACKEND = True +if not _USE_KERAS_3: + backend = os.environ.get("KERAS_BACKEND") + if backend and backend != "tensorflow": + raise RuntimeError( + "When running Keras 2, the `KERAS_BACKEND` environment variable " + f"must either be unset or `'tensorflow'`. Received: `{backend}`. " + "To set another backend, please install Keras 3. 
See " + "https://github.com/keras-team/keras-nlp#installation" + ) -def multi_backend(): - """Check if keras_core is enabled.""" - return _MULTI_BACKEND +def keras_3(): + """Check if Keras 3 is being used.""" + return _USE_KERAS_3 def backend(): """Check the backend framework.""" - return "tensorflow" if not multi_backend() else keras_core.config.backend() + if not keras_3(): + return "tensorflow" + + import keras + + return keras.config.backend() diff --git a/keras_nlp/backend/keras.py b/keras_nlp/backend/keras.py index 85f0ebcb5b..c248438f33 100644 --- a/keras_nlp/backend/keras.py +++ b/keras_nlp/backend/keras.py @@ -16,10 +16,10 @@ import tensorflow as tf -from keras_nlp.backend.config import multi_backend +from keras_nlp.backend import config -if multi_backend(): - from keras_core import * # noqa: F403, F401 +if config.keras_3(): + from keras import * # noqa: F403, F401 else: from tensorflow.keras import * # noqa: F403, F401 diff --git a/keras_nlp/backend/ops.py b/keras_nlp/backend/ops.py index 516834b6e5..f36a2d2d05 100644 --- a/keras_nlp/backend/ops.py +++ b/keras_nlp/backend/ops.py @@ -12,22 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import keras_core -import tensorflow as tf +from keras_nlp.backend import config -from keras_nlp.backend.config import multi_backend - -if multi_backend(): - from keras_core.src.ops import * # noqa: F403, F401 +if config.keras_3(): + from keras.ops import * # noqa: F403, F401 else: - from keras_core.src.backend.tensorflow import * # noqa: F403, F401 - from keras_core.src.backend.tensorflow.core import * # noqa: F403, F401 - from keras_core.src.backend.tensorflow.math import * # noqa: F403, F401 - from keras_core.src.backend.tensorflow.nn import * # noqa: F403, F401 - from keras_core.src.backend.tensorflow.numpy import * # noqa: F403, F401 - + from keras_core.ops import * # noqa: F403, F401 -if keras_core.config.backend() == "tensorflow" or not multi_backend(): +if config.backend() == "tensorflow": + import tensorflow as tf + from tensorflow.experimental import numpy as tfnp def take_along_axis(x, indices, axis=None): # TODO: move this workaround for dynamic shapes into keras-core. @@ -46,6 +40,4 @@ def take_along_axis(x, indices, axis=None): indices = tf.squeeze(indices, leftover_axes) return tf.gather(x, indices, batch_dims=axis) # Otherwise, fall back to the tfnp call. - return keras_core.src.backend.tensorflow.numpy.take_along_axis( - x, indices, axis=axis - ) + return tfnp.take_along_axis(x, indices, axis=axis) diff --git a/keras_nlp/backend/random.py b/keras_nlp/backend/random.py index 70be5910f4..c4f4e6f467 100644 --- a/keras_nlp/backend/random.py +++ b/keras_nlp/backend/random.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.backend.config import multi_backend +from keras_nlp.backend import config -if multi_backend(): - from keras_core.random import * # noqa: F403, F401 +if config.keras_3(): + from keras.random import * # noqa: F403, F401 else: - from keras_core.src.backend.tensorflow.random import * # noqa: F403, F401 + from keras_core.random import * # noqa: F403, F401 diff --git a/keras_nlp/conftest.py b/keras_nlp/conftest.py index 6bcda7ed0e..b876a7a0a8 100644 --- a/keras_nlp/conftest.py +++ b/keras_nlp/conftest.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - import pytest import tensorflow as tf @@ -21,21 +19,6 @@ from keras_nlp.backend import keras -@pytest.fixture(scope="session") -def tpu_strategy(): - tpu_name = os.getenv("KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS") - resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect( - tpu=tpu_name, - ) - return tf.distribute.TPUStrategy(resolver) - - -@pytest.fixture(scope="class") -def tpu_test_class(request, tpu_strategy): - # set a class attribute on the invoking test context - request.cls.tpu_strategy = tpu_strategy - - def pytest_addoption(parser): parser.addoption( "--run_large", @@ -49,31 +32,44 @@ def pytest_addoption(parser): default=False, help="run extra_large tests", ) - parser.addoption( - "--run_tpu", - action="store_true", - default=False, - help="run tpu tests", - ) - parser.addoption( - "--mixed_precision", - action="store_true", - default=False, - help="run with mixed precision", - ) parser.addoption( "--docstring_module", action="store", default="", help="restrict docs testing to modules whose name matches this flag", ) + parser.addoption( + "--check_gpu", + action="store_true", + default=False, + help="fail if a gpu is not present", + ) def pytest_configure(config): - if config.getoption("--mixed_precision"): - keras.mixed_precision.set_global_policy("mixed_float16") + # Verify that device has GPU and detected by backend + if config.getoption("--check_gpu"): + found_gpu = False + backend = backend_config.backend() + if backend == "jax": + import jax + + try: + found_gpu = bool(jax.devices("gpu")) + except RuntimeError: + found_gpu = False + elif backend == "tensorflow": + found_gpu = bool(tf.config.list_logical_devices("GPU")) + elif backend == "torch": + import torch + + found_gpu = bool(torch.cuda.device_count()) + if not found_gpu: + pytest.fail(f"No GPUs discovered on the {backend} backend.") + config.addinivalue_line( - "markers", "large: mark test as being slow or requiring a network" + "markers", + "large: mark test as being slow or requiring a network", ) config.addinivalue_line( "markers", @@ -81,11 +77,11 @@ def pytest_configure(config): ) config.addinivalue_line( "markers", - "tpu: mark test as tpu test", + "tf_only: mark test as a tf only test", ) config.addinivalue_line( "markers", - "tf_only: mark test as a tf only test", + "keras_3_only: mark test as a keras 3 only test", ) @@ -93,7 +89,6 @@ def pytest_collection_modifyitems(config, items): run_extra_large_tests = config.getoption("--run_extra_large") # Run large tests for --run_extra_large or --run_large. run_large_tests = config.getoption("--run_large") or run_extra_large_tests - run_tpu = config.getoption("--run_tpu") # Messages to annotate skipped tests with. 
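    # (For example, `large` tests run only with --run_large or
    # --run_extra_large, and `extra_large` tests only with --run_extra_large.)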
skip_large = pytest.mark.skipif( @@ -104,31 +99,26 @@ not run_large_tests, reason="need --run_large option to run", ) skip_extra_large = pytest.mark.skipif( not run_extra_large_tests, reason="need --run_extra_large option to run", ) - skip_tpu = pytest.mark.skipif( - not run_tpu, - reason="need --run_tpu option to run", - ) - skip_tf_only = pytest.mark.skipif( + tf_only = pytest.mark.skipif( not backend_config.backend() == "tensorflow", reason="tests only run on tf backend", ) + keras_3_only = pytest.mark.skipif( + not backend_config.keras_3(), + reason="tests only run with multi-backend keras", + ) for item in items: if "large" in item.keywords: item.add_marker(skip_large) if "extra_large" in item.keywords: item.add_marker(skip_extra_large) - if "tpu" in item.keywords: - item.add_marker(skip_tpu) if "tf_only" in item.keywords: - item.add_marker(skip_tf_only) + item.add_marker(tf_only) + if "keras_3_only" in item.keywords: + item.add_marker(keras_3_only) # Disable traceback filtering for quicker debugging of tests failures. tf.debugging.disable_traceback_filtering() -if backend_config.multi_backend(): +if backend_config.keras_3(): keras.config.disable_traceback_filtering() - -# One off setup for dtensor tests. -if not backend_config.multi_backend(): - keras.backend.experimental.enable_tf_random_generator() - keras.utils.set_random_seed(1337) diff --git a/keras_nlp/layers/__init__.py b/keras_nlp/layers/__init__.py index 105f511552..595c4eb661 100644 --- a/keras_nlp/layers/__init__.py +++ b/keras_nlp/layers/__init__.py @@ -18,6 +18,7 @@ from keras_nlp.layers.modeling.f_net_encoder import FNetEncoder from keras_nlp.layers.modeling.masked_lm_head import MaskedLMHead from keras_nlp.layers.modeling.position_embedding import PositionEmbedding +from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.layers.modeling.rotary_embedding import RotaryEmbedding from keras_nlp.layers.modeling.sine_position_encoding import ( SinePositionEncoding, ) diff --git a/keras_nlp/layers/modeling/cached_multi_head_attention.py b/keras_nlp/layers/modeling/cached_multi_head_attention.py index 16124328d9..3f30cb16ad 100644 --- a/keras_nlp/layers/modeling/cached_multi_head_attention.py +++ b/keras_nlp/layers/modeling/cached_multi_head_attention.py @@ -86,6 +86,7 @@ def call( ): if ( hasattr(self, "_build_from_signature") + and hasattr(self, "_built_from_signature") and not self._built_from_signature ): self._build_from_signature(query=query, value=value, key=key) diff --git a/keras_nlp/layers/modeling/cached_multi_head_attention_test.py b/keras_nlp/layers/modeling/cached_multi_head_attention_test.py index fdaab606d3..052ce66ec1 100644 --- a/keras_nlp/layers/modeling/cached_multi_head_attention_test.py +++ b/keras_nlp/layers/modeling/cached_multi_head_attention_test.py @@ -14,6 +14,7 @@ from keras_nlp.backend import config from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.cached_multi_head_attention import ( CachedMultiHeadAttention, ) @@ -23,21 +24,21 @@ class CachedMultiHeadAttentionTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=CachedMultiHeadAttention, + cls=CachedMultiHeadAttention, init_kwargs={ "num_heads": 2, "key_dim": 4, }, input_data={ - "query": ops.random.uniform(shape=(2, 4, 6)), - "value": ops.random.uniform(shape=(2, 4, 6)), + "query": random.uniform(shape=(2, 4, 6)), + "value": random.uniform(shape=(2, 4, 6)), }, expected_output_shape=(2, 4, 6), expected_num_trainable_weights=8, expected_num_non_trainable_variables=1, - # tf.keras
does not handle mixed precision correctly when not set + # Keras 2 does not handle mixed precision correctly when not set # globally. - run_mixed_precision_check=config.multi_backend(), + run_mixed_precision_check=config.keras_3(), ) def test_cache_call_is_correct(self): @@ -48,7 +49,7 @@ def test_cache_call_is_correct(self): hidden_dim = num_heads * key_dim input_shape = (batch_size, seq_len, hidden_dim) - x = ops.random.uniform(shape=input_shape) + x = random.uniform(shape=input_shape) input_cache = ops.zeros((batch_size, 2, seq_len, num_heads, key_dim)) # Use a causal mask. mask = ops.tril(ops.ones((seq_len, seq_len))) diff --git a/keras_nlp/layers/modeling/f_net_encoder_test.py b/keras_nlp/layers/modeling/f_net_encoder_test.py index ffafc8a740..e5d0b1ea77 100644 --- a/keras_nlp/layers/modeling/f_net_encoder_test.py +++ b/keras_nlp/layers/modeling/f_net_encoder_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.f_net_encoder import FNetEncoder from keras_nlp.tests.test_case import TestCase @@ -20,7 +20,7 @@ class FNetEncoderTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=FNetEncoder, + cls=FNetEncoder, init_kwargs={ "intermediate_dim": 4, "dropout": 0, @@ -29,7 +29,7 @@ def test_layer_behaviors(self): "kernel_initializer": "HeNormal", "bias_initializer": "Zeros", }, - input_data=ops.random.uniform(shape=(2, 4, 6)), + input_data=random.uniform(shape=(2, 4, 6)), expected_output_shape=(2, 4, 6), expected_num_trainable_weights=8, expected_num_non_trainable_variables=1, diff --git a/keras_nlp/layers/modeling/masked_lm_head_test.py b/keras_nlp/layers/modeling/masked_lm_head_test.py index 9ccdcae196..8d22ea0343 100644 --- a/keras_nlp/layers/modeling/masked_lm_head_test.py +++ b/keras_nlp/layers/modeling/masked_lm_head_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.masked_lm_head import MaskedLMHead from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.tests.test_case import TestCase @@ -21,7 +21,7 @@ class MaskedLMHeadTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=MaskedLMHead, + cls=MaskedLMHead, init_kwargs={ "vocabulary_size": 100, "activation": "softmax", @@ -29,8 +29,8 @@ def test_layer_behaviors(self): "bias_initializer": "Zeros", }, input_data={ - "inputs": ops.random.uniform(shape=(4, 10, 16)), - "mask_positions": ops.random.randint( + "inputs": random.uniform(shape=(4, 10, 16)), + "mask_positions": random.randint( minval=0, maxval=10, shape=(4, 5) ), }, @@ -42,7 +42,7 @@ def test_layer_behaviors_with_embedding(self): embedding = ReversibleEmbedding(100, 16) embedding.build((4, 10)) self.run_layer_test( - layer_cls=MaskedLMHead, + cls=MaskedLMHead, init_kwargs={ "vocabulary_size": 100, "activation": "softmax", @@ -51,8 +51,8 @@ def test_layer_behaviors_with_embedding(self): "token_embedding": embedding, }, input_data={ - "inputs": ops.random.uniform(shape=(4, 10, 16)), - "mask_positions": ops.random.randint( + "inputs": random.uniform(shape=(4, 10, 16)), + "mask_positions": random.randint( minval=0, maxval=10, shape=(4, 5) ), }, diff --git a/keras_nlp/layers/modeling/position_embedding_test.py b/keras_nlp/layers/modeling/position_embedding_test.py index ec16f541d3..549411e0b8 100644 --- a/keras_nlp/layers/modeling/position_embedding_test.py +++ b/keras_nlp/layers/modeling/position_embedding_test.py @@ -16,6 +16,7 @@ from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.position_embedding import PositionEmbedding from keras_nlp.tests.test_case import TestCase @@ -30,22 +31,22 @@ def custom_init(shape, dtype=None): class PositionEmbeddingTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=PositionEmbedding, + cls=PositionEmbedding, init_kwargs={ "sequence_length": 21, }, - input_data=ops.random.uniform(shape=(4, 21, 30)), + input_data=random.uniform(shape=(4, 21, 30)), expected_output_shape=(4, 21, 30), expected_num_trainable_weights=1, ) def test_layer_behaviors_4d(self): self.run_layer_test( - layer_cls=PositionEmbedding, + cls=PositionEmbedding, init_kwargs={ "sequence_length": 21, }, - input_data=ops.random.uniform(shape=(4, 5, 21, 30)), + input_data=random.uniform(shape=(4, 5, 21, 30)), expected_output_shape=(4, 5, 21, 30), expected_num_trainable_weights=1, ) @@ -145,7 +146,7 @@ def test_callable_initializer(self): def test_start_index(self): batch_size, seq_length, feature_size = 2, 3, 4 layer = PositionEmbedding(seq_length) - data = ops.random.uniform(shape=(batch_size, seq_length, feature_size)) + data = random.uniform(shape=(batch_size, seq_length, feature_size)) full_output = layer(data) sequential_output = ops.zeros((batch_size, seq_length, feature_size)) for i in range(seq_length): diff --git a/keras_nlp/layers/modeling/reversible_embedding_test.py b/keras_nlp/layers/modeling/reversible_embedding_test.py index ceb04578db..0875759a77 100644 --- a/keras_nlp/layers/modeling/reversible_embedding_test.py +++ b/keras_nlp/layers/modeling/reversible_embedding_test.py @@ -20,6 +20,7 @@ from keras_nlp.backend import config from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.backend import random from 
keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.tests.test_case import TestCase @@ -31,14 +32,14 @@ class ReversibleEmbeddingTest(TestCase): ) def test_layer_behaviors_tied(self, tie_weights): self.run_layer_test( - layer_cls=ReversibleEmbedding, + cls=ReversibleEmbedding, init_kwargs={ "input_dim": 100, "output_dim": 32, "tie_weights": tie_weights, "embeddings_initializer": "HeNormal", }, - input_data=ops.random.randint(minval=0, maxval=100, shape=(4, 10)), + input_data=random.randint(minval=0, maxval=100, shape=(4, 10)), expected_output_shape=(4, 10, 32), expected_num_trainable_weights=1 if tie_weights else 2, ) diff --git a/keras_nlp/layers/modeling/rotary_embedding.py b/keras_nlp/layers/modeling/rotary_embedding.py index b3402f7e21..45f77ce494 100644 --- a/keras_nlp/layers/modeling/rotary_embedding.py +++ b/keras_nlp/layers/modeling/rotary_embedding.py @@ -85,10 +85,7 @@ def __init__( self.built = True def call(self, inputs, start_index=0): - rotary_dim = ops.shape(inputs)[-1] - cos_emb, sin_emb = self._compute_cos_sin_embedding( - inputs, rotary_dim, start_index - ) + cos_emb, sin_emb = self._compute_cos_sin_embedding(inputs, start_index) return self._apply_rotary_pos_emb(inputs, cos_emb, sin_emb) def _apply_rotary_pos_emb(self, tensor, cos_emb, sin_emb): @@ -96,34 +93,44 @@ def _apply_rotary_pos_emb(self, tensor, cos_emb, sin_emb): half_rot_tensor = ops.concatenate((-x2, x1), axis=self.feature_axis) return (tensor * cos_emb) + (half_rot_tensor * sin_emb) - def _compute_cos_sin_embedding(self, x, rotary_dim, start_index): - freq_range = ops.arange(0, rotary_dim, 2, dtype="float32") - freq_range = ops.cast(freq_range, self.compute_dtype) - freq_range = freq_range / ops.cast( - self.scaling_factor, self.compute_dtype - ) - inverse_freq = 1.0 / ( - self.max_wavelength - ** (freq_range / ops.cast(rotary_dim, self.compute_dtype)) - ) - seq_len = ops.shape(x)[self.sequence_axis] - tensor = ops.arange(seq_len, dtype="float32") + start_index - tensor = ops.cast(tensor, dtype=inverse_freq.dtype) - freq = ops.einsum("i, j -> ij", tensor, inverse_freq) - embedding = ops.concatenate((freq, freq), axis=self.feature_axis) - + def _compute_cos_sin_embedding(self, inputs, start_index=0): def get_axis(axis): - return axis if axis > 0 else len(x.shape) + axis + return axis if axis > 0 else len(inputs.shape) + axis feature_axis = get_axis(self.feature_axis) sequence_axis = get_axis(self.sequence_axis) - for axis in range(len(x.shape)): + rotary_dim = ops.shape(inputs)[feature_axis] + inverse_freq = self._get_inverse_freq(rotary_dim) + + seq_len = ops.shape(inputs)[self.sequence_axis] + tensor = ops.cast(ops.arange(seq_len), self.compute_dtype) + start_index + + tensor = ops.cast(tensor, dtype=inverse_freq.dtype) + freq = ops.einsum("i,j->ij", tensor, inverse_freq) + embedding = ops.concatenate((freq, freq), axis=-1) + + # Reshape the embedding to be broadcastable with input shape. 
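+        # For example, 3D inputs of shape (batch, sequence, feature) pair with
+        # an embedding of shape (sequence, rotary_dim); the `expand_dims` below
+        # adds the singleton batch axis so cos/sin broadcast against the inputs.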
+ if feature_axis < sequence_axis: + embedding = ops.transpose(embedding) + for axis in range(len(inputs.shape)): if axis != sequence_axis and axis != feature_axis: embedding = ops.expand_dims(embedding, axis) return ops.cos(embedding), ops.sin(embedding) + def _get_inverse_freq(self, rotary_dim): + freq_range = ops.arange(0, rotary_dim, 2) + freq_range = ops.cast(freq_range, self.compute_dtype) + freq_range = freq_range / ops.cast( + self.scaling_factor, self.compute_dtype + ) + inverse_freq = 1.0 / ( + self.max_wavelength + ** (freq_range / ops.cast(rotary_dim, self.compute_dtype)) + ) + return inverse_freq + def get_config(self): config = super().get_config() config.update( diff --git a/keras_nlp/layers/modeling/rotary_embedding_test.py b/keras_nlp/layers/modeling/rotary_embedding_test.py index 3fdac028de..c0fc2906e7 100644 --- a/keras_nlp/layers/modeling/rotary_embedding_test.py +++ b/keras_nlp/layers/modeling/rotary_embedding_test.py @@ -14,6 +14,7 @@ from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.rotary_embedding import RotaryEmbedding from keras_nlp.tests.test_case import TestCase @@ -21,24 +22,24 @@ class RotaryEmbeddingTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=RotaryEmbedding, + cls=RotaryEmbedding, init_kwargs={ "max_wavelength": 1000, "scaling_factor": 2.0, "sequence_axis": 1, "feature_axis": -1, }, - input_data=ops.random.uniform(shape=(2, 4, 6)), + input_data=random.uniform(shape=(2, 4, 6)), expected_output_shape=(2, 4, 6), ) def test_layer_behaviors_4d(self): self.run_layer_test( - layer_cls=RotaryEmbedding, + cls=RotaryEmbedding, init_kwargs={ "max_wavelength": 1000, }, - input_data=ops.random.uniform(shape=(2, 8, 4, 6)), + input_data=random.uniform(shape=(2, 8, 4, 6)), expected_output_shape=(2, 8, 4, 6), ) @@ -86,7 +87,7 @@ def test_output_correct_values(self): def test_start_index(self): batch_size, seq_length, feature_size = 2, 3, 4 layer = RotaryEmbedding(seq_length) - data = ops.random.uniform(shape=(batch_size, seq_length, feature_size)) + data = random.uniform(shape=(batch_size, seq_length, feature_size)) full_output = layer(data) sequential_output = ops.zeros((batch_size, seq_length, feature_size)) for i in range(seq_length): @@ -96,6 +97,18 @@ def test_start_index(self): ) self.assertAllClose(full_output, sequential_output) + def test_permuted_axes(self): + batch_size, seq_length, feature_size = 2, 3, 4 + data = random.uniform(shape=(batch_size, seq_length, feature_size)) + layer = RotaryEmbedding(seq_length) + outputs = layer(data) + permuted_data = ops.transpose(data, (0, 2, 1)) + permuted_layer = RotaryEmbedding( + seq_length, sequence_axis=-1, feature_axis=-2 + ) + permuted_outputs = permuted_layer(permuted_data) + self.assertAllClose(outputs, ops.transpose(permuted_outputs, (0, 2, 1))) + def test_float16_dtype(self): embedding_layer = RotaryEmbedding(dtype="float16") seq_length = 100 diff --git a/keras_nlp/layers/modeling/sine_position_encoding_test.py b/keras_nlp/layers/modeling/sine_position_encoding_test.py index 22d1d9d3bf..80dad26cbc 100644 --- a/keras_nlp/layers/modeling/sine_position_encoding_test.py +++ b/keras_nlp/layers/modeling/sine_position_encoding_test.py @@ -14,6 +14,7 @@ from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.sine_position_encoding import ( SinePositionEncoding, ) @@ -23,21 +24,21 @@ class 
SinePositionEncodingTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=SinePositionEncoding, + cls=SinePositionEncoding, init_kwargs={ "max_wavelength": 10000, }, - input_data=ops.random.uniform(shape=(2, 4, 6)), + input_data=random.uniform(shape=(2, 4, 6)), expected_output_shape=(2, 4, 6), ) def test_layer_behaviors_4d(self): self.run_layer_test( - layer_cls=SinePositionEncoding, + cls=SinePositionEncoding, init_kwargs={ "max_wavelength": 10000, }, - input_data=ops.random.uniform(shape=(1, 2, 4, 6)), + input_data=random.uniform(shape=(1, 2, 4, 6)), expected_output_shape=(1, 2, 4, 6), ) @@ -85,7 +86,7 @@ def test_output_correct_values(self): pos_encoding, ] ) - input = ops.random.uniform(shape=[1, 4, 6]) + input = random.uniform(shape=[1, 4, 6]) output = model(input) # comapre position encoding values for position 0 and 3 @@ -97,7 +98,7 @@ def test_output_correct_values(self): def test_start_index(self): batch_size, seq_length, feature_size = 2, 3, 4 layer = SinePositionEncoding() - data = ops.random.uniform(shape=(batch_size, seq_length, feature_size)) + data = random.uniform(shape=(batch_size, seq_length, feature_size)) full_output = layer(data) sequential_output = ops.zeros((batch_size, seq_length, feature_size)) for i in range(seq_length): diff --git a/keras_nlp/layers/modeling/token_and_position_embedding.py b/keras_nlp/layers/modeling/token_and_position_embedding.py index f3dffe345f..bb7107f96f 100644 --- a/keras_nlp/layers/modeling/token_and_position_embedding.py +++ b/keras_nlp/layers/modeling/token_and_position_embedding.py @@ -122,7 +122,7 @@ def get_config(self): ), "tie_weights": self.token_embedding.tie_weights, "mask_zero": self.token_embedding.mask_zero, - }, + } ) return config diff --git a/keras_nlp/layers/modeling/token_and_position_embedding_test.py b/keras_nlp/layers/modeling/token_and_position_embedding_test.py index ceb96b114e..122d74e13d 100644 --- a/keras_nlp/layers/modeling/token_and_position_embedding_test.py +++ b/keras_nlp/layers/modeling/token_and_position_embedding_test.py @@ -14,8 +14,8 @@ import numpy as np -from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.token_and_position_embedding import ( TokenAndPositionEmbedding, ) @@ -25,14 +25,14 @@ class TokenAndPositionEmbeddingTest(TestCase): def test_layer_behaviors(self): self.run_layer_test( - layer_cls=TokenAndPositionEmbedding, + cls=TokenAndPositionEmbedding, init_kwargs={ "vocabulary_size": 5, "sequence_length": 4, "embedding_dim": 3, - "embeddings_initializer": keras.initializers.Constant(1.0), + "embeddings_initializer": "ones", }, - input_data=ops.random.randint(minval=0, maxval=5, shape=(2, 4)), + input_data=random.randint(minval=0, maxval=5, shape=(2, 4)), expected_output_shape=(2, 4, 3), expected_output_data=ops.ones((2, 4, 3)) * 2, expected_num_trainable_weights=2, diff --git a/keras_nlp/layers/modeling/transformer_decoder.py b/keras_nlp/layers/modeling/transformer_decoder.py index 92734fca2e..3a3cda3f21 100644 --- a/keras_nlp/layers/modeling/transformer_decoder.py +++ b/keras_nlp/layers/modeling/transformer_decoder.py @@ -199,12 +199,12 @@ def build( ) if hasattr(self._cross_attention_layer, "_build_from_signature"): self._cross_attention_layer._build_from_signature( - query=encoder_sequence_shape, + query=decoder_sequence_shape, value=encoder_sequence_shape, ) else: self._cross_attention_layer.build( - query_shape=encoder_sequence_shape, + query_shape=decoder_sequence_shape, 
value_shape=encoder_sequence_shape, ) self._cross_attention_layer_norm = keras.layers.LayerNormalization( @@ -212,7 +212,7 @@ dtype=self.dtype_policy, name="cross_attention_layer_norm", ) - self._cross_attention_layer_norm.build(encoder_sequence_shape) + self._cross_attention_layer_norm.build(decoder_sequence_shape) self._cross_attention_dropout = keras.layers.Dropout( rate=self.dropout, dtype=self.dtype_policy, diff --git a/keras_nlp/layers/modeling/transformer_decoder_test.py b/keras_nlp/layers/modeling/transformer_decoder_test.py index 12cd189e74..2b54324f02 100644 --- a/keras_nlp/layers/modeling/transformer_decoder_test.py +++ b/keras_nlp/layers/modeling/transformer_decoder_test.py @@ -15,6 +15,7 @@ from absl.testing import parameterized from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.transformer_decoder import TransformerDecoder from keras_nlp.tests.test_case import TestCase @@ -26,7 +27,7 @@ class TransformerDecoderTest(TestCase): ) def test_layer_behaviors(self, normalize_first): self.run_layer_test( - layer_cls=TransformerDecoder, + cls=TransformerDecoder, init_kwargs={ "intermediate_dim": 4, "num_heads": 2, @@ -36,7 +37,7 @@ def test_layer_behaviors(self, normalize_first): "kernel_initializer": "HeNormal", "bias_initializer": "Zeros", }, - input_data=ops.random.uniform(shape=(2, 4, 6)), + input_data=random.uniform(shape=(2, 4, 6)), expected_output_shape=(2, 4, 6), expected_num_trainable_weights=16, expected_num_non_trainable_variables=3, # dropout rng seeds @@ -49,7 +50,7 @@ def test_layer_behaviors(self, normalize_first): def test_layer_behaviors_with_cross_attention(self, normalize_first): self.run_layer_test( - layer_cls=TransformerDecoder, + cls=TransformerDecoder, init_kwargs={ "intermediate_dim": 4, "num_heads": 2, @@ -60,8 +61,8 @@ def test_layer_behaviors_with_cross_attention(self, normalize_first): "bias_initializer": "Zeros", }, input_data={ - "decoder_sequence": ops.random.uniform(shape=(2, 4, 6)), - "encoder_sequence": ops.random.uniform(shape=(2, 4, 6)), + "decoder_sequence": random.uniform(shape=(2, 4, 6)), + "encoder_sequence": random.uniform(shape=(2, 4, 6)), }, expected_output_shape=(2, 4, 6), expected_num_trainable_weights=26, @@ -106,8 +107,8 @@ def test_mask_propagation(self): intermediate_dim=4, num_heads=2, ) - decoder_sequence = ops.random.uniform(shape=[1, 4, 6]) - encoder_sequence = ops.random.uniform(shape=[1, 4, 6]) + decoder_sequence = random.uniform(shape=[1, 4, 6]) + encoder_sequence = random.uniform(shape=[1, 4, 6]) mask = ops.array([[True, True, False, False]]) decoder_sequence._keras_mask = mask outputs = decoder(decoder_sequence, encoder_sequence) @@ -118,7 +119,7 @@ def test_mask_propagation_without_cross_attention(self): intermediate_dim=4, num_heads=2, ) - decoder_sequence = ops.random.uniform(shape=[1, 4, 6]) + decoder_sequence = random.uniform(shape=[1, 4, 6]) mask = ops.array([[True, True, False, False]]) decoder_sequence._keras_mask = mask outputs = decoder(decoder_sequence) @@ -132,7 +133,7 @@ def test_cache_call_is_correct(self): hidden_dim = num_heads * key_dim input_shape = (batch_size, seq_len, hidden_dim) - x = ops.random.uniform(shape=input_shape) + x = random.uniform(shape=input_shape) input_cache = ops.zeros((batch_size, 2, seq_len, num_heads, key_dim)) outputs = ops.zeros_like(x) @@ -168,3 +169,12 @@ def call(outputs, cache): output, output_cache = call(outputs, input_cache) self.assertAllClose(output, no_loop_outputs) self.assertAllClose(output_cache,
no_loop_cache) + + def test_different_feature_dimension_for_encoder_and_decoder_sequence(self): + decoder = TransformerDecoder( + intermediate_dim=4, + num_heads=2, + ) + decoder_sequence = random.uniform(shape=[1, 4, 6]) + encoder_sequence = random.uniform(shape=[1, 4, 5]) + decoder(decoder_sequence, encoder_sequence) diff --git a/keras_nlp/layers/modeling/transformer_encoder_test.py b/keras_nlp/layers/modeling/transformer_encoder_test.py index 9fe7c3eab2..844125c4b0 100644 --- a/keras_nlp/layers/modeling/transformer_encoder_test.py +++ b/keras_nlp/layers/modeling/transformer_encoder_test.py @@ -16,6 +16,7 @@ from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.layers.modeling.transformer_encoder import TransformerEncoder from keras_nlp.tests.test_case import TestCase @@ -27,7 +28,7 @@ class TransformerEncoderTest(TestCase): ) def test_layer_behaviors(self, normalize_first): self.run_layer_test( - layer_cls=TransformerEncoder, + cls=TransformerEncoder, init_kwargs={ "intermediate_dim": 4, "num_heads": 2, @@ -37,7 +38,7 @@ def test_layer_behaviors(self, normalize_first): "kernel_initializer": "HeNormal", "bias_initializer": "Zeros", }, - input_data=ops.random.uniform(shape=(2, 4, 6)), + input_data=random.uniform(shape=(2, 4, 6)), expected_output_shape=(2, 4, 6), expected_num_trainable_weights=16, expected_num_non_trainable_variables=3, # dropout rng seeds @@ -59,7 +60,7 @@ def test_valid_call(self, normalize_first): encoder, ] ) - input = ops.random.uniform(shape=[2, 4, 6]) + input = random.uniform(shape=[2, 4, 6]) model(input) def test_valid_call_with_mask(self): @@ -68,7 +69,7 @@ def test_valid_call_with_mask(self): num_heads=2, ) encoder.build([2, 4, 6]) - input = ops.random.uniform(shape=[2, 4, 6]) + input = random.uniform(shape=[2, 4, 6]) mask = input[:, :, 0] < 0.5 encoder(input, mask) @@ -86,7 +87,7 @@ def test_mask_propagation(self): intermediate_dim=4, num_heads=2, ) - inputs = ops.random.uniform(shape=[1, 4, 6]) + inputs = random.uniform(shape=[1, 4, 6]) mask = ops.array([[True, True, False, False]]) inputs._keras_mask = mask outputs = encoder(inputs) diff --git a/keras_nlp/layers/modeling/transformer_layer_utils_test.py b/keras_nlp/layers/modeling/transformer_layer_utils_test.py index 7fc2013ad7..57df677e94 100644 --- a/keras_nlp/layers/modeling/transformer_layer_utils_test.py +++ b/keras_nlp/layers/modeling/transformer_layer_utils_test.py @@ -14,6 +14,7 @@ import keras_nlp.layers.modeling.transformer_layer_utils as utils from keras_nlp.backend import ops +from keras_nlp.backend import random from keras_nlp.tests.test_case import TestCase @@ -25,7 +26,7 @@ def test_compute_causal_mask(self): def test_merge_padding_and_attention_mask(self): padding_mask = ops.array([[1, 1, 0]]) attention_mask = ops.array([[[0, 0, 1], [0, 1, 0], [1, 0, 0]]]) - inputs = ops.random.uniform(shape=[1, 3, 2]) + inputs = random.uniform(shape=[1, 3, 2]) merged_mask = utils.merge_padding_and_attention_mask( inputs, padding_mask, @@ -37,7 +38,7 @@ def test_bad_mask_shapes(self): with self.assertRaises(ValueError): padding_mask = ops.array([[[1, 1, 0], [1, 0, 0]]]) attention_mask = ops.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]]) - inputs = ops.random.uniform(shape=[1, 3, 2]) + inputs = random.uniform(shape=[1, 3, 2]) utils.merge_padding_and_attention_mask( inputs, padding_mask, @@ -47,7 +48,7 @@ def test_bad_mask_shapes(self): with self.assertRaises(ValueError): padding_mask = ops.array([[1, 1, 0]]) attention_mask = ops.array([[0, 0, 1], [1, 
0, 0]]) - inputs = ops.random.uniform(shape=[1, 3, 2]) + inputs = random.uniform(shape=[1, 3, 2]) utils.merge_padding_and_attention_mask( inputs, padding_mask, diff --git a/keras_nlp/layers/preprocessing/preprocessing_layer.py b/keras_nlp/layers/preprocessing/preprocessing_layer.py index 63c40713e4..d6101da150 100644 --- a/keras_nlp/layers/preprocessing/preprocessing_layer.py +++ b/keras_nlp/layers/preprocessing/preprocessing_layer.py @@ -29,7 +29,12 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self._convert_input_args = False self._allow_non_tensor_positional_args = True - self.built = True + # Most preprocessing has no build. + if not hasattr(self, "build"): + self.built = True + + def get_build_config(self): + return None def __call__(self, *args, **kwargs): # Always place on CPU for preprocessing, to avoid expensive back and diff --git a/keras_nlp/layers/preprocessing/random_deletion.py b/keras_nlp/layers/preprocessing/random_deletion.py index 41289c0054..061290ba56 100644 --- a/keras_nlp/layers/preprocessing/random_deletion.py +++ b/keras_nlp/layers/preprocessing/random_deletion.py @@ -21,7 +21,7 @@ PreprocessingLayer, ) from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype from keras_nlp.utils.tensor_utils import is_string_dtype @@ -125,7 +125,7 @@ def __init__( dtype="int32", **kwargs, ): - if not is_integer_dtype(dtype) and not is_string_dtype(dtype): + if not is_int_dtype(dtype) and not is_string_dtype(dtype): raise ValueError( "Output dtype must be an integer type or a string. " f"Received: dtype={dtype}" diff --git a/keras_nlp/layers/preprocessing/random_swap.py b/keras_nlp/layers/preprocessing/random_swap.py index ad6f1c0980..27873f0fe8 100644 --- a/keras_nlp/layers/preprocessing/random_swap.py +++ b/keras_nlp/layers/preprocessing/random_swap.py @@ -21,7 +21,7 @@ PreprocessingLayer, ) from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype from keras_nlp.utils.tensor_utils import is_string_dtype @@ -127,7 +127,7 @@ def __init__( dtype="int32", **kwargs, ): - if not is_integer_dtype(dtype) and not is_string_dtype(dtype): + if not is_int_dtype(dtype) and not is_string_dtype(dtype): raise ValueError( "Output dtype must be an integer type or a string. " f"Received: dtype={dtype}" diff --git a/keras_nlp/metrics/bleu.py b/keras_nlp/metrics/bleu.py index 51d224c676..750a1b704a 100644 --- a/keras_nlp/metrics/bleu.py +++ b/keras_nlp/metrics/bleu.py @@ -20,7 +20,7 @@ from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.backend import ops -from keras_nlp.utils.tensor_utils import is_floating_dtype +from keras_nlp.utils.tensor_utils import is_float_dtype from keras_nlp.utils.tensor_utils import tensor_to_list REPLACE_SUBSTRINGS = [ @@ -112,7 +112,7 @@ def __init__( ): super().__init__(name=name, dtype=dtype, **kwargs) - if not is_floating_dtype(dtype): + if not is_float_dtype(dtype): raise ValueError( "`dtype` must be a floating point type.
" f"Received: dtype={dtype}" diff --git a/keras_nlp/metrics/edit_distance.py b/keras_nlp/metrics/edit_distance.py index 899f7f1f2e..263ff8290b 100644 --- a/keras_nlp/metrics/edit_distance.py +++ b/keras_nlp/metrics/edit_distance.py @@ -16,7 +16,7 @@ from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras -from keras_nlp.utils.tensor_utils import is_floating_dtype +from keras_nlp.utils.tensor_utils import is_float_dtype @keras_nlp_export("keras_nlp.metrics.EditDistance") @@ -87,7 +87,7 @@ def __init__( ): super().__init__(name=name, dtype=dtype, **kwargs) - if not is_floating_dtype(dtype): + if not is_float_dtype(dtype): raise ValueError( "`dtype` must be a floating point type. " f"Received: dtype={dtype}" diff --git a/keras_nlp/metrics/perplexity.py b/keras_nlp/metrics/perplexity.py index eb742fc31c..4a7e626bc9 100644 --- a/keras_nlp/metrics/perplexity.py +++ b/keras_nlp/metrics/perplexity.py @@ -15,7 +15,7 @@ from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.backend import ops -from keras_nlp.utils.tensor_utils import is_floating_dtype +from keras_nlp.utils.tensor_utils import is_float_dtype @keras_nlp_export("keras_nlp.metrics.Perplexity") @@ -88,7 +88,7 @@ def __init__( name="perplexity", **kwargs, ): - if not is_floating_dtype(dtype): + if not is_float_dtype(dtype): raise ValueError( "`dtype` must be a floating point type. " f"Received: dtype={dtype}" diff --git a/keras_nlp/metrics/rouge_base.py b/keras_nlp/metrics/rouge_base.py index f84e718080..824a6c4b5b 100644 --- a/keras_nlp/metrics/rouge_base.py +++ b/keras_nlp/metrics/rouge_base.py @@ -16,7 +16,7 @@ from keras_nlp.backend import keras from keras_nlp.backend import ops -from keras_nlp.utils.tensor_utils import is_floating_dtype +from keras_nlp.utils.tensor_utils import is_float_dtype from keras_nlp.utils.tensor_utils import tensor_to_list try: @@ -65,7 +65,7 @@ def __init__( "package. Please install it with `pip install rouge-score`." ) - if not is_floating_dtype(dtype): + if not is_float_dtype(dtype): raise ValueError( "`dtype` must be a floating point type. 
" f"Received: dtype={dtype}" diff --git a/keras_nlp/models/__init__.py b/keras_nlp/models/__init__.py index eb4e74be3a..ab04d8eae0 100644 --- a/keras_nlp/models/__init__.py +++ b/keras_nlp/models/__init__.py @@ -63,6 +63,8 @@ from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( DistilBertTokenizer, ) +from keras_nlp.models.electra.electra_backbone import ElectraBackbone +from keras_nlp.models.electra.electra_tokenizer import ElectraTokenizer from keras_nlp.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.models.f_net.f_net_classifier import FNetClassifier from keras_nlp.models.f_net.f_net_masked_lm import FNetMaskedLM @@ -87,6 +89,8 @@ GPTNeoXPreprocessor, ) from keras_nlp.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer +from keras_nlp.models.llama.llama_backbone import LlamaBackbone +from keras_nlp.models.mistral.mistral_backbone import MistralBackbone from keras_nlp.models.opt.opt_backbone import OPTBackbone from keras_nlp.models.opt.opt_causal_lm import OPTCausalLM from keras_nlp.models.opt.opt_causal_lm_preprocessor import ( diff --git a/keras_nlp/models/albert/albert_backbone.py b/keras_nlp/models/albert/albert_backbone.py index 51da1f49a7..414bb97e87 100644 --- a/keras_nlp/models/albert/albert_backbone.py +++ b/keras_nlp/models/albert/albert_backbone.py @@ -21,6 +21,7 @@ from keras_nlp.layers.modeling.transformer_encoder import TransformerEncoder from keras_nlp.models.albert.albert_presets import backbone_presets from keras_nlp.models.backbone import Backbone +from keras_nlp.utils.keras_utils import gelu_approximate from keras_nlp.utils.python_utils import classproperty @@ -180,9 +181,7 @@ def get_group_layer(group_idx): TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, - activation=lambda x: keras.activations.gelu( - x, approximate=True - ), + activation=gelu_approximate, dropout=dropout, layer_norm_epsilon=1e-12, kernel_initializer=albert_kernel_initializer(), diff --git a/keras_nlp/models/albert/albert_backbone_test.py b/keras_nlp/models/albert/albert_backbone_test.py index 5f93c5dc12..f1211e0fa3 100644 --- a/keras_nlp/models/albert/albert_backbone_test.py +++ b/keras_nlp/models/albert/albert_backbone_test.py @@ -12,66 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.albert.albert_backbone import AlbertBackbone from keras_nlp.tests.test_case import TestCase class AlbertBackboneTest(TestCase): def setUp(self): - self.backbone = AlbertBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - num_groups=1, - num_inner_repetitions=1, - embedding_dim=16, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - self.batch_size = 8 - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "segment_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "num_groups": 1, + "num_inner_repetitions": 1, + "embedding_dim": 16, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_albert(self): - self.backbone(self.input_batch) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "albert_backbone") - - def test_variable_sequence_length_call_albert(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "segment_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=AlbertBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "sequence_output": (2, 5, 2), + "pooled_output": (2, 2), + }, ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) def test_error_for_invalid_num_groups(self): with self.assertRaises(ValueError): @@ -88,47 +65,46 @@ def test_error_for_invalid_num_groups(self): @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, AlbertBackbone) - - # Check that output matches.
- restored_output = restored_model(self.input_batch) - self.assertAllClose( - model_output["pooled_output"], restored_output["pooled_output"] + self.run_model_saving_test( + cls=AlbertBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=AlbertBackbone, + preset="albert_base_en_uncased", + input_data={ + "token_ids": ops.array([[2, 13, 1, 3]], dtype="int32"), + "segment_ids": ops.zeros((1, 4), dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape={ + "sequence_output": (1, 4, 768), + "pooled_output": (1, 768), + }, + # The forward pass from a preset should be stable! + expected_partial_output={ + "sequence_output": ( + ops.array( + [1.830863, 1.698645, -1.819195, -0.53382, -0.38114] + ) + ), + "pooled_output": ( + ops.array( + [0.328261, -0.415397, -0.388745, 0.156846, 0.657874] + ) + ), + }, + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class AlbertBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = AlbertBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - num_groups=1, - num_inner_repetitions=1, - embedding_dim=16, - hidden_dim=2, - intermediate_dim=2, - max_sequence_length=4, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in AlbertBackbone.presets: + self.run_preset_test( + cls=AlbertBackbone, + preset=preset, + input_data=self.input_data, ) - - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "segment_ids": np.ones((8, 128), dtype="int32"), - "padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/albert/albert_classifier_test.py b/keras_nlp/models/albert/albert_classifier_test.py index 71fef68ce1..5a60ff998b 100644 --- a/keras_nlp/models/albert/albert_classifier_test.py +++ b/keras_nlp/models/albert/albert_classifier_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,16 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os -import numpy as np import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.albert.albert_backbone import AlbertBackbone from keras_nlp.models.albert.albert_classifier import AlbertClassifier from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor @@ -31,120 +25,59 @@ class AlbertClassifierTest(TestCase): def setUp(self): - # Setup model - - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - tokenizer = AlbertTokenizer(proto=self.proto) - + # Setup model. 
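The test rewrites throughout this diff all follow one shape: fixtures move into `init_kwargs`/`input_data`, and assertions move into shared `TestCase` helpers such as `run_layer_test`, `run_backbone_test`, and `run_task_test`. A schematic of the pattern, using a hypothetical `MyDense` stand-in layer (parameter names taken from usage in this diff; treat it as a sketch, not a guaranteed-passing test):

```python
from keras_nlp.backend import keras
from keras_nlp.backend import random
from keras_nlp.tests.test_case import TestCase

class MyDense(keras.layers.Dense):
    # Stand-in layer so the sketch is self-contained.
    pass

class MyDenseTest(TestCase):
    def test_layer_behaviors(self):
        self.run_layer_test(
            cls=MyDense,                       # class under test
            init_kwargs={"units": 6},          # constructor arguments
            input_data=random.uniform(shape=(2, 4, 6)),
            expected_output_shape=(2, 4, 6),
            expected_num_trainable_weights=2,  # kernel + bias
        )
```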
self.preprocessor = AlbertPreprocessor( - tokenizer=tokenizer, - sequence_length=5, + AlbertTokenizer( + # Generated using create_albert_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "albert_test_vocab.spm" + ), + sequence_length=5, + ) ) self.backbone = AlbertBackbone( vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, num_heads=2, - embedding_dim=2, hidden_dim=2, + embedding_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - - self.classifier = AlbertClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - loss="sparse_categorical_crossentropy", - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. + [1, 0], # Labels. ) - self.classifier.fit(self.preprocessed_dataset) - - def test_serialization(self): - # Defaults. - original = AlbertClassifier( - self.backbone, - num_classes=2, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. 
- original = AlbertClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - name="test", - trainable=False, + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_classifier_basics(self): + self.run_task_test( + cls=AlbertClassifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) @pytest.mark.large - def test_saving_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back - self.assertIsInstance(restored_model, AlbertClassifier) + def test_saved_model(self): + self.run_model_saving_test( + cls=AlbertClassifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in AlbertClassifier.presets: + self.run_preset_test( + cls=AlbertClassifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/albert/albert_masked_lm.py b/keras_nlp/models/albert/albert_masked_lm.py index 423f196c0d..e95af7c207 100644 --- a/keras_nlp/models/albert/albert_masked_lm.py +++ b/keras_nlp/models/albert/albert_masked_lm.py @@ -24,6 +24,7 @@ ) from keras_nlp.models.albert.albert_presets import backbone_presets from keras_nlp.models.task import Task +from keras_nlp.utils.keras_utils import gelu_approximate from keras_nlp.utils.python_utils import classproperty @@ -107,9 +108,7 @@ def __init__(self, backbone, preprocessor=None, **kwargs): outputs = MaskedLMHead( vocabulary_size=backbone.vocabulary_size, token_embedding=backbone.token_embedding, - intermediate_activation=lambda x: keras.activations.gelu( - x, approximate=True - ), + intermediate_activation=gelu_approximate, kernel_initializer=albert_kernel_initializer(), name="mlm_head", )(backbone_outputs["sequence_output"], inputs["mask_positions"]) diff --git a/keras_nlp/models/albert/albert_masked_lm_preprocessor.py b/keras_nlp/models/albert/albert_masked_lm_preprocessor.py index 9a52d28a2c..89cf134465 100644 --- a/keras_nlp/models/albert/albert_masked_lm_preprocessor.py +++ b/keras_nlp/models/albert/albert_masked_lm_preprocessor.py @@ -131,18 +131,27 @@ def __init__( truncate=truncate, **kwargs, ) - + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer masker creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
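This defer-to-`build()` pattern recurs across the preprocessors in this diff. A generic sketch of it with a hypothetical wrapper layer (the `MaskedLMMaskGenerator` arguments mirror the code that follows):

```python
from keras_nlp.backend import keras
from keras_nlp.layers import MaskedLMMaskGenerator

class DeferredMaskingLayer(keras.layers.Layer):
    def __init__(self, tokenizer, mask_selection_rate=0.15, **kwargs):
        super().__init__(**kwargs)
        # __init__ only records configuration; when a saved model is
        # restored, tokenizer assets may not have loaded yet.
        self.tokenizer = tokenizer
        self.mask_selection_rate = mask_selection_rate
        self.masker = None

    def build(self, input_shape):
        # By build() time the vocabulary is available, so ids like
        # mask_token_id can be read safely.
        self.masker = MaskedLMMaskGenerator(
            mask_selection_rate=self.mask_selection_rate,
            vocabulary_size=self.tokenizer.vocabulary_size(),
            mask_token_id=self.tokenizer.mask_token_id,
        )
        self.built = True
```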
self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.cls_token_id, - tokenizer.sep_token_id, - tokenizer.pad_token_id, + self.tokenizer.cls_token_id, + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, ], ) @@ -150,10 +159,10 @@ def get_config(self): config = super().get_config() config.update( { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, } ) return config diff --git a/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py b/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py index d95b072108..79d3a36bbb 100644 --- a/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.albert.albert_masked_lm_preprocessor import ( AlbertMaskedLMPreprocessor, ) @@ -27,123 +25,67 @@ class AlbertMaskedLMPreprocessorTest(TestCase): def setUp(self): - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = AlbertTokenizer( + # Generated using create_albert_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "albert_test_vocab.spm" + ) ) - - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - - proto = bytes_io.getvalue() - - tokenizer = AlbertTokenizer(proto=proto) - - self.preprocessor = AlbertMaskedLMPreprocessor( - tokenizer=tokenizer, + self.init_kwargs = { + "tokenizer": self.tokenizer, # Simplify our testing by masking every available token. 
- mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=4, - sequence_length=12, + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, + } + self.input_data = ["the quick brown fox"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=AlbertMaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[5, 10, 6, 8]], + [[1.0, 1.0, 1.0, 1.0]], + ), ) - def test_preprocess_strings(self): - input_data = "the quick brown fox" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4]) - self.assertAllEqual(y, [5, 10, 6, 8]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) - - def test_preprocess_list_of_strings(self): - input_data = ["the quick brown fox"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[5, 10, 6, 8]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_preprocess_dataset(self): - sentences = tf.constant(["the quick brown fox"] * 4) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x["token_ids"], [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[5, 10, 6, 8]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_mask_multiple_sentences(self): - sentence_one = tf.constant("the quick") - sentence_two = tf.constant("brown fox") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - x["token_ids"], [2, 4, 4, 3, 4, 4, 3, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 4, 5]) - self.assertAllEqual(y, [5, 10, 6, 8]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) - def test_no_masking_zero_rate(self): no_mask_preprocessor = AlbertMaskedLMPreprocessor( - self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, mask_selection_length=4, sequence_length=12, ) - input_data = "the quick brown fox" - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] + input_data = ["the quick brown fox"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 0, 0, 0]], 
+ [[0.0, 0.0, 0.0, 0.0]], + ), ) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0]) - self.assertAllEqual(y, [0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in AlbertMaskedLMPreprocessor.presets: + self.run_preset_test( + cls=AlbertMaskedLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/albert/albert_masked_lm_test.py b/keras_nlp/models/albert/albert_masked_lm_test.py index 9d9ea2478d..49e6a595cd 100644 --- a/keras_nlp/models/albert/albert_masked_lm_test.py +++ b/keras_nlp/models/albert/albert_masked_lm_test.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.albert.albert_backbone import AlbertBackbone from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM from keras_nlp.models.albert.albert_masked_lm_preprocessor import ( @@ -32,33 +28,14 @@ class AlbertMaskedLMTest(TestCase): def setUp(self): # Setup model. - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round", "an eagle flew"] - ) - - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=15, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - - proto = bytes_io.getvalue() - - tokenizer = AlbertTokenizer(proto=proto) - self.preprocessor = AlbertMaskedLMPreprocessor( - tokenizer=tokenizer, + AlbertTokenizer( + # Generated using create_albert_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "albert_test_vocab.spm" + ), + sequence_length=5, + ), # Simplify our testing by masking every available token. mask_selection_rate=1.0, mask_token_rate=1.0, @@ -70,65 +47,41 @@ def setUp(self): vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, num_heads=2, - embedding_dim=4, - hidden_dim=4, + hidden_dim=2, + embedding_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.masked_lm = AlbertMaskedLM( - self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - self.masked_lm_no_preprocessing = AlbertMaskedLM( - self.backbone, - preprocessor=None, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. 
) - - self.raw_batch = [ - "quick brown fox", - "eagle flew over fox", - "the eagle flew quick", - "a brown eagle", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.masked_lm(self.preprocessed_batch) - - def test_albert_masked_lm_fit_default_compile(self): - self.masked_lm.fit(self.raw_dataset) - - def test_classifier_predict(self): - self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch) - - def test_classifier_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_masked_lm_basics(self): + self.run_task_test( + cls=AlbertMaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 12), ) - self.masked_lm.fit(self.preprocessed_dataset) @pytest.mark.large def test_saved_model(self): - model_output = self.masked_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) + self.run_model_saving_test( + cls=AlbertMaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check we got the real object back. - self.assertIsInstance(restored_model, AlbertMaskedLM) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output, atol=0.01, rtol=0.01) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in AlbertMaskedLM.presets: + self.run_preset_test( + cls=AlbertMaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/albert/albert_preprocessor.py b/keras_nlp/models/albert/albert_preprocessor.py index 4849aab392..5d5628a729 100644 --- a/keras_nlp/models/albert/albert_preprocessor.py +++ b/keras_nlp/models/albert/albert_preprocessor.py @@ -158,20 +158,28 @@ def __init__( ): super().__init__(**kwargs) self.tokenizer = tokenizer + self.truncate = truncate + self.sequence_length = sequence_length + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
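For context on what is being deferred here: `MultiSegmentPacker` wraps each segment with start/end ids and pads to a fixed length. A usage sketch with hypothetical token ids, chosen to mirror the expected outputs in the preprocessor test below:

```python
from keras_nlp.layers import MultiSegmentPacker

packer = MultiSegmentPacker(
    start_value=2,  # hypothetical "[CLS]" id
    end_value=3,    # hypothetical "[SEP]" id
    pad_value=0,
    sequence_length=8,
)
token_ids, segment_ids = packer([[5, 10, 6, 8]])
# token_ids   -> [[2, 5, 10, 6, 8, 3, 0, 0]]
# segment_ids -> [[0, 0, 0, 0, 0, 0, 0, 0]]
```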
self.packer = MultiSegmentPacker( start_value=self.tokenizer.cls_token_id, end_value=self.tokenizer.sep_token_id, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) + self.built = True def get_config(self): config = super().get_config() config.update( { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, + "sequence_length": self.sequence_length, + "truncate": self.truncate, } ) return config diff --git a/keras_nlp/models/albert/albert_preprocessor_test.py b/keras_nlp/models/albert/albert_preprocessor_test.py index 2830b85073..7d6fb4cfd4 100644 --- a/keras_nlp/models/albert/albert_preprocessor_test.py +++ b/keras_nlp/models/albert/albert_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer from keras_nlp.tests.test_case import TestCase @@ -25,139 +23,49 @@ class AlbertPreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = AlbertTokenizer( + # Generated using create_albert_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "albert_test_vocab.spm" + ) + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["the quick brown fox"], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.preprocessor = AlbertPreprocessor( - tokenizer=AlbertTokenizer(proto=self.proto), - sequence_length=12, - ) - - def test_tokenize_strings(self): - input_data = "the quick brown fox" - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - output["segment_ids"], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - - def test_tokenize_list_of_strings(self): - # We should handle a list of strings as as batch. 
- input_data = ["the quick brown fox"] * 4 - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], - [[2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0]] * 4, - ) - self.assertAllEqual( - output["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual( - x_out["token_ids"], - [[2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0]] * 4, - ) - self.assertAllEqual( - x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - def test_tokenize_labeled_dataset(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x_out["token_ids"], - [[2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0]] * 4, - ) - self.assertAllEqual( - x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant("the quick brown fox") - sentence_two = tf.constant("the earth") - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], - [2, 5, 10, 6, 8, 3, 5, 7, 3, 0, 0, 0], - ) - self.assertAllEqual( - output["segment_ids"], [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0] - ) - - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant(["the quick brown fox"] * 4) - sentence_two = tf.constant(["the earth"] * 4) - # The first tuple or list is always interpreted as an enumeration of - # separate sequences to concatenate. - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], - [[2, 5, 10, 6, 8, 3, 5, 7, 3, 0, 0, 0]] * 4, - ) - self.assertAllEqual( - output["segment_ids"], [[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=AlbertPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 5, 10, 6, 8, 3, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. 
+ ), ) def test_errors_for_2d_list_input(self): + preprocessor = AlbertPreprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) + preprocessor(ambiguous_input) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in AlbertPreprocessor.presets: + self.run_preset_test( + cls=AlbertPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/albert/albert_presets.py b/keras_nlp/models/albert/albert_presets.py index 34126f52cd..3cd8215295 100644 --- a/keras_nlp/models/albert/albert_presets.py +++ b/keras_nlp/models/albert/albert_presets.py @@ -26,24 +26,7 @@ "path": "albert", "model_card": "https://github.com/google-research/albert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30000, - "num_layers": 12, - "num_heads": 12, - "num_groups": 1, - "num_inner_repetitions": 1, - "embedding_dim": 128, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.0, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/albert_base_en_uncased/v1/model.h5", - "weights_hash": "b83ccf3418dd84adc569324183176813", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/albert_base_en_uncased/v1/vocab.spm", - "spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5", + "kaggle_handle": "kaggle://keras/albert/keras/albert_base_en_uncased/2", }, "albert_large_en_uncased": { "metadata": { @@ -56,24 +39,7 @@ "path": "albert", "model_card": "https://github.com/google-research/albert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30000, - "num_layers": 24, - "num_heads": 16, - "num_groups": 1, - "num_inner_repetitions": 1, - "embedding_dim": 128, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/albert_large_en_uncased/v1/model.h5", - "weights_hash": "c7754804efb245f06dd6e7ced32e082c", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/albert_large_en_uncased/v1/vocab.spm", - "spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5", + "kaggle_handle": "kaggle://keras/albert/keras/albert_large_en_uncased/2", }, "albert_extra_large_en_uncased": { "metadata": { @@ -86,24 +52,7 @@ "path": "albert", "model_card": "https://github.com/google-research/albert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30000, - "num_layers": 24, - "num_heads": 16, - "num_groups": 1, - "num_inner_repetitions": 1, - "embedding_dim": 128, - "hidden_dim": 2048, - "intermediate_dim": 8192, - "dropout": 0, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/albert_extra_large_en_uncased/v1/model.h5", - "weights_hash": "713209be8aadfa614fd79f18c9aeb16d", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/albert_extra_large_en_uncased/v1/vocab.spm", - "spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5", + "kaggle_handle": 
"kaggle://keras/albert/keras/albert_extra_large_en_uncased/2", }, "albert_extra_extra_large_en_uncased": { "metadata": { @@ -116,23 +65,6 @@ "path": "albert", "model_card": "https://github.com/google-research/albert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30000, - "num_layers": 12, - "num_heads": 64, - "num_groups": 1, - "num_inner_repetitions": 1, - "embedding_dim": 128, - "hidden_dim": 4096, - "intermediate_dim": 16384, - "dropout": 0, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/albert_extra_extra_large_en_uncased/v1/model.h5", - "weights_hash": "a835177b692fb6a82139f94c66db2f22", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/albert_extra_extra_large_en_uncased/v1/vocab.spm", - "spm_proto_hash": "73e62ff8e90f951f24c8b907913039a5", + "kaggle_handle": "kaggle://keras/albert/keras/albert_extra_extra_large_en_uncased/2", }, } diff --git a/keras_nlp/models/albert/albert_presets_test.py b/keras_nlp/models/albert/albert_presets_test.py deleted file mode 100644 index 86265a1abd..0000000000 --- a/keras_nlp/models/albert/albert_presets_test.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.albert.albert_backbone import AlbertBackbone -from keras_nlp.models.albert.albert_classifier import AlbertClassifier -from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor -from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class AlbertPresetSmokeTest(TestCase): - """ - A smoke test for ALBERT presets we run continuously. - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/albert/albert_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = AlbertTokenizer.from_preset( - "albert_base_en_uncased", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [13, 1, 438, 2231, 886, 2385, 9] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - preprocessor = AlbertPreprocessor.from_preset( - "albert_base_en_uncased", - sequence_length=4, - ) - outputs = preprocessor("The quick brown fox.")["token_ids"] - expected_outputs = [2, 13, 1, 3] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["The quick brown fox."] - model = AlbertClassifier.from_preset( - "albert_base_en_uncased", - num_classes=2, - load_weights=load_weights, - ) - # We don't assert output values, as the head weights are random. 
- model.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_classifier_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "segment_ids": ops.array([[0, 0, 0, 0]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = AlbertClassifier.from_preset( - "albert_base_en_uncased", - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[2, 13, 1, 3]]), - "segment_ids": ops.array([[0, 0, 0, 0]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = AlbertBackbone.from_preset( - "albert_base_en_uncased", load_weights=load_weights - ) - outputs = model(input_data) - if load_weights: - outputs = outputs["sequence_output"][0, 0, :5] - expected = [1.830863, 1.698645, -1.819195, -0.53382, -0.38114] - self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("albert_tokenizer", AlbertTokenizer), - ("albert_preprocessor", AlbertPreprocessor), - ("albert", AlbertBackbone), - ("albert_classifier", AlbertClassifier), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("albert_tokenizer", AlbertTokenizer, {}), - ("albert_preprocessor", AlbertPreprocessor, {}), - ("albert", AlbertBackbone, {}), - ("albert_classifier", AlbertClassifier, {"num_classes": 2}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("albert_base_en_uncased_clowntown", **kwargs) - - -@pytest.mark.extra_large -class AlbertPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - This tests every ALBERT preset and is only run manually. 
- Run with: - `pytest keras_nlp/models/albert/albert_presets_test.py --run_extra_large` """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_albert(self, load_weights): - for preset in AlbertBackbone.presets: - model = AlbertBackbone.from_preset( - preset, load_weights=load_weights - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "segment_ids": ops.array([0] * 200 + [1] * 312, shape=(1, 512)), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_albert_classifier(self, load_weights): - for preset in AlbertClassifier.presets: - classifier = AlbertClassifier.from_preset( - preset, - num_classes=2, - load_weights=load_weights, - ) - input_data = ["This quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_albert_classifier_without_preprocessing(self, load_weights): - for preset in AlbertClassifier.presets: - classifier = AlbertClassifier.from_preset( - preset, - num_classes=2, - preprocessor=None, - load_weights=load_weights, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "segment_ids": ops.array([0] * 200 + [1] * 312, shape=(1, 512)), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in AlbertTokenizer.presets: - tokenizer = AlbertTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in AlbertPreprocessor.presets: - preprocessor = AlbertPreprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/albert/albert_tokenizer.py b/keras_nlp/models/albert/albert_tokenizer.py index 1b85be3a99..44aed44cf5 100644 --- a/keras_nlp/models/albert/albert_tokenizer.py +++ b/keras_nlp/models/albert/albert_tokenizer.py @@ -87,25 +87,38 @@ class AlbertTokenizer(SentencePieceTokenizer): """ def __init__(self, proto, **kwargs): + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.pad_token = "<pad>" + self.mask_token = "[MASK]" + super().__init__(proto=proto, **kwargs) - # Check for necessary special tokens. - cls_token = "[CLS]" - sep_token = "[SEP]" - pad_token = "<pad>" - mask_token = "[MASK]" - for token in [cls_token, sep_token, pad_token, mask_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.cls_token_id = self.token_to_id(cls_token) - self.sep_token_id = self.token_to_id(sep_token) - self.pad_token_id = self.token_to_id(pad_token) - self.mask_token_id = self.token_to_id(mask_token) + def set_proto(self, proto): + super().set_proto(proto) + if proto is not None: + for token in [ + self.cls_token, + self.sep_token, + self.pad_token, + self.mask_token, + ]: + if token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name."
+ ) + + self.cls_token_id = self.token_to_id(self.cls_token) + self.sep_token_id = self.token_to_id(self.sep_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.mask_token_id = self.token_to_id(self.mask_token) + else: + self.cls_token_id = None + self.sep_token_id = None + self.pad_token_id = None + self.mask_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/albert/albert_tokenizer_test.py b/keras_nlp/models/albert/albert_tokenizer_test.py index fb7c145518..74ad0604dc 100644 --- a/keras_nlp/models/albert/albert_tokenizer_test.py +++ b/keras_nlp/models/albert/albert_tokenizer_test.py @@ -12,77 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer from keras_nlp.tests.test_case import TestCase class AlbertTokenizerTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="<pad>", - unk_piece="<unk>", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.tokenizer = AlbertTokenizer(proto=self.proto) - - def test_tokenize(self): - input_data = "the quick brown fox" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [5, 10, 6, 8]) - - def test_tokenize_batch(self): - input_data = ["the quick brown fox", "the earth is round"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[5, 10, 6, 8], [5, 7, 9, 11]]) + self.init_kwargs = { + # Generated using create_albert_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "albert_test_vocab.spm" + ) + } + self.input_data = ["the quick brown fox", "the earth is round"] - def test_detokenize(self): - input_data = [[5, 10, 6, 8]] - output = self.tokenizer.detokenize(input_data) - self.assertEqual(output, ["the quick brown fox"]) - - def test_vocabulary_size(self): - tokenizer = AlbertTokenizer(proto=self.proto) - self.assertEqual(tokenizer.vocabulary_size(), 12) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=AlbertTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]], + ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - AlbertTokenizer(proto=bytes_io.getvalue()) + AlbertTokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=AlbertTokenizer, + preset="albert_base_en_uncased", + input_data=["The quick brown fox."],
+ expected_output=[[13, 1, 438, 2231, 886, 2385, 9]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in AlbertTokenizer.presets: + self.run_preset_test( + cls=AlbertTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/backbone.py b/keras_nlp/models/backbone.py index b7a7ba2119..69da56593b 100644 --- a/keras_nlp/models/backbone.py +++ b/keras_nlp/models/backbone.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - from keras_nlp.backend import keras +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring @@ -24,6 +24,19 @@ class Backbone(keras.Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._token_embedding = None + self._functional_layer_ids = set( + id(layer) for layer in self._flatten_layers() + ) + + def __dir__(self): + # Temporary fixes for weight saving. This mimics the following PR for + # older version of Keras: https://github.com/keras-team/keras/pull/18982 + def filter_fn(attr): + if attr == "_layer_checkpoint_dependencies": + return False + return id(getattr(self, attr)) not in self._functional_layer_ids + + return filter(filter_fn, super().__dir__()) def __setattr__(self, name, value): # Work around torch setattr for properties. @@ -94,32 +107,17 @@ def from_preset( ) ``` """ - - if not cls.presets: - raise NotImplementedError( - "No presets have been created for this class." - ) - - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - metadata = cls.presets[preset] - config = metadata["config"] - model = cls.from_config({**config, **kwargs}) - - if not load_weights: - return model - - weights = keras.utils.get_file( - "model.h5", - metadata["weights_url"], - cache_subdir=os.path.join("models", preset), - file_hash=metadata["weights_hash"], + # We support short IDs for official presets, e.g. `"bert_base_en"`. + # Map these to a Kaggle Models handle. + if preset in cls.presets: + preset = cls.presets[preset]["kaggle_handle"] + + check_preset_class(preset, cls) + return load_from_preset( + preset, + load_weights=load_weights, + config_overrides=kwargs, ) - model.load_weights(weights) - return model def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. 
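The `Backbone.from_preset` refactor above is the heart of this change: the per-class preset tables no longer carry `config`/`weights_url` entries, and instead act purely as alias maps from short IDs to Kaggle Models handles, with all loading delegated to `load_from_preset`. A quick usage sketch of the resulting call patterns (the handle string is the one listed in `bart_presets.py` later in this diff; everything else follows the signature shown above):

```python
from keras_nlp.models import BartBackbone

# Short ID: resolved via BartBackbone.presets[...]["kaggle_handle"].
backbone = BartBackbone.from_preset("bart_base_en")

# Equivalent direct Kaggle handle, as listed in bart_presets.py below.
backbone = BartBackbone.from_preset("kaggle://keras/bart/keras/bart_base_en/2")

# Architecture only: skip downloading/restoring the checkpoint weights.
backbone = BartBackbone.from_preset("bart_base_en", load_weights=False)
```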
diff --git a/keras_nlp/models/bart/bart_backbone.py b/keras_nlp/models/bart/bart_backbone.py index 203d3ab2d7..2679b84a9f 100644 --- a/keras_nlp/models/bart/bart_backbone.py +++ b/keras_nlp/models/bart/bart_backbone.py @@ -157,9 +157,7 @@ def __init__( x = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, - activation=lambda x: keras.activations.gelu( - x, approximate=False - ), + activation=keras.activations.gelu, dropout=dropout, layer_norm_epsilon=1e-5, kernel_initializer=bart_kernel_initializer(), @@ -200,9 +198,7 @@ def __init__( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, - activation=lambda x: keras.activations.gelu( - x, approximate=False - ), + activation=keras.activations.gelu, layer_norm_epsilon=1e-5, kernel_initializer=bart_kernel_initializer(), name=f"transformer_decoder_layer_{i}", diff --git a/keras_nlp/models/bart/bart_backbone_test.py b/keras_nlp/models/bart/bart_backbone_test.py index feb643a4de..fe4b6af52a 100644 --- a/keras_nlp/models/bart/bart_backbone_test.py +++ b/keras_nlp/models/bart/bart_backbone_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,110 +12,80 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.bart.bart_backbone import BartBackbone from keras_nlp.tests.test_case import TestCase class BartBackboneTest(TestCase): def setUp(self): - self.backbone = BartBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=3, - intermediate_dim=4, - max_sequence_length=5, - ) - self.input_batch = { - "encoder_token_ids": np.ones((2, 5), dtype="int32"), - "encoder_padding_mask": np.ones((2, 5), dtype="int32"), - "decoder_token_ids": np.ones((2, 5), dtype="int32"), - "decoder_padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "encoder_token_ids": ops.ones((2, 3), dtype="int32"), + "encoder_padding_mask": ops.zeros((2, 3), dtype="int32"), + "decoder_token_ids": ops.ones((2, 5), dtype="int32"), + "decoder_padding_mask": ops.zeros((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call(self): - self.backbone(self.input_batch) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "bart_backbone") - - def test_variable_sequence_length_call(self): - for seq_length in (2, 3, 4): - input_data = { - "encoder_token_ids": np.ones((2, seq_length), dtype="int32"), - "encoder_padding_mask": np.ones((2, seq_length), dtype="int32"), - "decoder_token_ids": np.ones((2, seq_length), dtype="int32"), - "decoder_padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + 
self.run_backbone_test( + cls=BartBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "encoder_sequence_output": (2, 3, 2), + "decoder_sequence_output": (2, 5, 2), + }, ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, BartBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose( - model_output["encoder_sequence_output"], - restored_output["encoder_sequence_output"], - ) - self.assertAllClose( - model_output["decoder_sequence_output"], - restored_output["decoder_sequence_output"], + self.run_model_saving_test( + cls=BartBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=BartBackbone, + preset="bart_base_en", + input_data={ + "encoder_token_ids": ops.array([[0, 133, 2119, 2]]), + "encoder_padding_mask": ops.array([[1, 1, 1, 1]]), + "decoder_token_ids": ops.array([[0, 7199, 14, 2119, 2]]), + "decoder_padding_mask": ops.array([[1, 1, 1, 1, 1]]), + }, + expected_output_shape={ + "encoder_sequence_output": (1, 4, 768), + "decoder_sequence_output": (1, 5, 768), + }, + # The forward pass from a preset should be stable! + expected_partial_output={ + "encoder_sequence_output": ops.array( + [-0.033, 0.013, -0.003, -0.012, -0.002] + ), + "decoder_sequence_output": ops.array( + [2.516, 2.489, 0.695, 8.057, 1.245] + ), + }, + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class BartBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = BartBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=64, - intermediate_dim=128, - max_sequence_length=128, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BartBackbone.presets: + self.run_preset_test( + cls=BartBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "encoder_token_ids": np.ones((8, 128), dtype="int32"), - "encoder_padding_mask": np.ones((8, 128), dtype="int32"), - "decoder_token_ids": np.ones((8, 128), dtype="int32"), - "decoder_padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/bart/bart_preprocessor.py b/keras_nlp/models/bart/bart_preprocessor.py index ebe0310b69..ffe2148839 100644 --- a/keras_nlp/models/bart/bart_preprocessor.py +++ b/keras_nlp/models/bart/bart_preprocessor.py @@ -140,15 +140,23 @@ def __init__( ): super().__init__(**kwargs) self.tokenizer = tokenizer + self.encoder_sequence_length = encoder_sequence_length + self.decoder_sequence_length = decoder_sequence_length + self.encoder_packer = None + self.decoder_packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
# TODO: Use `MultiSegmentPacker` instead of `StartEndPacker` once we # want to move to multi-segment packing and have improved # `MultiSegmentPacker`'s performance. self.encoder_packer = StartEndPacker( - start_value=tokenizer.start_token_id, - end_value=tokenizer.end_token_id, - pad_value=tokenizer.pad_token_id, - sequence_length=encoder_sequence_length, + start_value=self.tokenizer.start_token_id, + end_value=self.tokenizer.end_token_id, + pad_value=self.tokenizer.pad_token_id, + sequence_length=self.encoder_sequence_length, return_padding_mask=True, ) @@ -161,19 +169,10 @@ def __init__( ], end_value=self.tokenizer.end_token_id, pad_value=self.tokenizer.pad_token_id, - sequence_length=decoder_sequence_length, + sequence_length=self.decoder_sequence_length, return_padding_mask=True, ) - - def get_config(self): - config = super().get_config() - config.update( - { - "encoder_sequence_length": self.encoder_packer.sequence_length, - "decoder_sequence_length": self.decoder_packer.sequence_length, - } - ) - return config + self.built = True def call(self, x, y=None, sample_weight=None): if not ( @@ -217,6 +216,16 @@ def call(self, x, y=None, sample_weight=None): return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "encoder_sequence_length": self.encoder_sequence_length, + "decoder_sequence_length": self.decoder_sequence_length, + } + ) + return config + @classproperty def tokenizer_cls(cls): return BartTokenizer @@ -224,63 +233,3 @@ def tokenizer_cls(cls): @classproperty def presets(cls): return copy.deepcopy(backbone_presets) - - @classmethod - def from_preset( - cls, - preset, - **kwargs, - ): - # Override base class's `from_preset` to handle `encoder_sequence_length` - # and `decoder_sequence_length`. - if not cls.presets: - raise NotImplementedError( - "No presets have been created for this class." - ) - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = cls.tokenizer_cls.from_preset(preset) - - metadata = cls.presets[preset] - # For task model presets, the backbone config is nested. - if "backbone" in metadata["config"]: - backbone_config = metadata["config"]["backbone"]["config"] - else: - backbone_config = metadata["config"] - - # Use model's `max_sequence_length` if either `encoder_sequence_length` - # or `decoder_sequence_length` are unspecified; otherwise check that - # `encoder_sequence_length`/`decoder_sequence_length` are not too long. - encoder_sequence_length = kwargs.pop("encoder_sequence_length", None) - decoder_sequence_length = kwargs.pop("decoder_sequence_length", None) - max_sequence_length = backbone_config["max_sequence_length"] - - def check_sequence_length(sequence_length, name): - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`{name}` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." 
- ) - return sequence_length - else: - return max_sequence_length - - encoder_sequence_length = check_sequence_length( - encoder_sequence_length, "encoder_sequence_length" - ) - decoder_sequence_length = check_sequence_length( - decoder_sequence_length, "decoder_sequence_length" - ) - - return cls( - tokenizer=tokenizer, - encoder_sequence_length=encoder_sequence_length, - decoder_sequence_length=decoder_sequence_length, - **kwargs, - ) diff --git a/keras_nlp/models/bart/bart_preprocessor_test.py b/keras_nlp/models/bart/bart_preprocessor_test.py index 189de29ed3..23cb7cae79 100644 --- a/keras_nlp/models/bart/bart_preprocessor_test.py +++ b/keras_nlp/models/bart/bart_preprocessor_test.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.bart.bart_preprocessor import BartPreprocessor from keras_nlp.models.bart.bart_tokenizer import BartTokenizer from keras_nlp.tests.test_case import TestCase @@ -22,141 +22,48 @@ class BartPreprocessorTest(TestCase): def setUp(self): - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.tokenizer = BartTokenizer( vocabulary=self.vocab, merges=self.merges ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "encoder_sequence_length": 5, + "decoder_sequence_length": 8, } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - - self.preprocessor = BartPreprocessor( - tokenizer=BartTokenizer( - vocabulary=vocab, - merges=merges, - ), - encoder_sequence_length=10, - decoder_sequence_length=9, - ) - - def test_tokenize_strings(self): - input_data = { - "encoder_text": " airplane at airport", - "decoder_text": " kohli is the best", - } - - output = self.preprocessor(input_data) - self.assertAllEqual( - output["encoder_token_ids"], [0, 3, 4, 5, 3, 6, 2, 1, 1, 1] - ) - self.assertAllEqual( - output["encoder_padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0] - ) - self.assertAllEqual( - output["decoder_token_ids"], [2, 0, 7, 8, 9, 10, 11, 2, 1] - ) - self.assertAllEqual( - output["decoder_padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 0] - ) - - def test_key_order(self): - self.assertAllClose( - self.preprocessor( - { - "encoder_text": " airplane at airport", - "decoder_text": " kohli is the best", - } - ), - self.preprocessor( + self.input_data = ( + { + "encoder_text": [" airplane at airport"], + "decoder_text": [" airplane airport"], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights.
+ ) + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=BartPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( { - "decoder_text": " kohli is the best", - "encoder_text": " airplane at airport", - } + "encoder_token_ids": [[0, 4, 5, 6, 2]], + "encoder_padding_mask": [[1, 1, 1, 1, 1]], + "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 2, 1]], + "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ), ) - def test_tokenize_list_of_strings(self): - input_data = { - "encoder_text": [" airplane at airport"] * 4, - "decoder_text": [" kohli is the best"] * 4, - } - - output = self.preprocessor(input_data) - self.assertAllEqual( - output["encoder_token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - output["encoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - output["decoder_token_ids"], [[2, 0, 7, 8, 9, 10, 11, 2, 1]] * 4 - ) - self.assertAllEqual( - output["decoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = { - "encoder_text": [" airplane at airport"] * 4, - "decoder_text": [" kohli is the best"] * 4, - } - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual( - x_out["encoder_token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["encoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["decoder_token_ids"], [[2, 0, 7, 8, 9, 10, 11, 2, 1]] * 4 - ) - self.assertAllEqual( - x_out["decoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = { - "encoder_text": [" airplane at airport"] * 4, - "decoder_text": [" kohli is the best"] * 4, - } - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x_out["encoder_token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["encoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["decoder_token_ids"], [[2, 0, 7, 8, 9, 10, 11, 2, 1]] * 4 - ) - self.assertAllEqual( - x_out["decoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - def test_error_multi_segment_input(self): + preprocessor = BartPreprocessor(**self.init_kwargs) input_data = { "encoder_text": ( tf.constant([" airplane at airport"] * 2), @@ -167,14 +74,14 @@ def test_error_multi_segment_input(self): tf.constant([" airplane"] * 2), ), "decoder_text": ( tf.constant([" kohli"] * 2), ), } with self.assertRaises(ValueError): - self.preprocessor(input_data) - - def test_serialization(self): - new_preprocessor = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.preprocessor) - ) - self.assertEqual( - new_preprocessor.get_config(), self.preprocessor.get_config() - ) + preprocessor(input_data) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BartPreprocessor.presets: + self.run_preset_test( + cls=BartPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bart/bart_presets.py b/keras_nlp/models/bart/bart_presets.py index
aa06254c10..cca8d54959 100644 --- a/keras_nlp/models/bart/bart_presets.py +++ b/keras_nlp/models/bart/bart_presets.py @@ -25,22 +25,7 @@ "path": "bart", "model_card": "https://github.com/facebookresearch/fairseq/blob/main/examples/bart/README.md", }, - "config": { - "vocabulary_size": 50265, - "num_layers": 6, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 1024, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bart_base_en/v1/model.h5", - "weights_hash": "5b59403f0cafafbd89680e0785791163", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bart_base_en/v1/vocab.json", - "vocabulary_hash": "be4d3c6f3f5495426b2c03b334334354", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/bart_base_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/bart/keras/bart_base_en/2", }, "bart_large_en": { "metadata": { @@ -62,13 +47,7 @@ "dropout": 0.1, "max_sequence_length": 1024, }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bart_large_en/v1/model.h5", - "weights_hash": "6bfe7e591af8c5699ce6f9f18753af9a", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bart_large_en/v1/vocab.json", - "vocabulary_hash": "cf410ee085c5c69c957bb1f6d8456596", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/bart_large_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/bart/keras/bart_large_en/2", }, "bart_large_en_cnn": { "metadata": { @@ -90,12 +69,6 @@ "dropout": 0.1, "max_sequence_length": 1024, }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bart_large_en_cnn/v1/model.h5", - "weights_hash": "99782ecd9365956f016096fef9afd62c", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bart_large_en_cnn/v1/vocab.json", - "vocabulary_hash": "be4d3c6f3f5495426b2c03b334334354", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/bart_large_en_cnn/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/bart/keras/bart_large_en_cnn/2", }, } diff --git a/keras_nlp/models/bart/bart_presets_test.py b/keras_nlp/models/bart/bart_presets_test.py deleted file mode 100644 index 89518d845b..0000000000 --- a/keras_nlp/models/bart/bart_presets_test.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-"""Tests for loading pretrained model presets.""" - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.bart.bart_backbone import BartBackbone -from keras_nlp.models.bart.bart_tokenizer import BartTokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class BartPresetSmokeTest(TestCase): - """ - A smoke test for BART presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/bart/bart_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = BartTokenizer.from_preset( - "bart_base_en", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [133, 2119, 6219, 23602, 4] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "encoder_token_ids": ops.array([[0, 133, 2119, 2]]), - "encoder_padding_mask": ops.array([[1, 1, 1, 1]]), - "decoder_token_ids": ops.array([[0, 7199, 14, 2119, 2]]), - "decoder_padding_mask": ops.array([[1, 1, 1, 1, 1]]), - } - model = BartBackbone.from_preset( - "bart_base_en", load_weights=load_weights - ) - outputs = model(input_data) - if load_weights: - encoder_output = outputs["encoder_sequence_output"][0, 0, :5] - expected_encoder_output = [-0.033, 0.013, -0.003, -0.012, -0.002] - decoder_output = outputs["decoder_sequence_output"][0, 0, :5] - expected_decoder_output = [2.516, 2.489, 0.695, 8.057, 1.245] - - self.assertAllClose( - encoder_output, expected_encoder_output, atol=0.01, rtol=0.01 - ) - self.assertAllClose( - decoder_output, expected_decoder_output, atol=0.01, rtol=0.01 - ) - - @parameterized.named_parameters( - ("bart_tokenizer", BartTokenizer), - ("bart", BartBackbone), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("bart_tokenizer", BartTokenizer), - ("bart", BartBackbone), - ) - def test_unknown_preset_error(self, cls): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("bart_base_en_clowntown") - - -@pytest.mark.extra_large -class BartPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every BART preset and is only run manually.
- Run with: - `pytest keras_nlp/models/bart/bart_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_bart(self, load_weights): - for preset in BartBackbone.presets: - model = BartBackbone.from_preset(preset, load_weights=load_weights) - input_data = { - "encoder_token_ids": ops.random.uniform( - shape=(1, 1024), - dtype="int64", - maxval=model.vocabulary_size, - ), - "encoder_padding_mask": ops.array( - [1] * 768 + [0] * 256, shape=(1, 1024) - ), - "decoder_token_ids": ops.random.uniform( - shape=(1, 1024), - dtype="int64", - maxval=model.vocabulary_size, - ), - "decoder_padding_mask": ops.array( - [1] * 489 + [0] * 535, shape=(1, 1024) - ), - } - model(input_data) - - def test_load_tokenizers(self): - for preset in BartTokenizer.presets: - tokenizer = BartTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") diff --git a/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor.py b/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor.py index e238b668e9..3d398d29d1 100644 --- a/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor.py +++ b/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor.py @@ -46,16 +46,6 @@ class BartSeq2SeqLMPreprocessor(BartPreprocessor): tokenizer: A `keras_nlp.models.BartTokenizer` instance. encoder_sequence_length: The length of the packed encoder inputs. decoder_sequence_length: The length of the packed decoder inputs. - truncate: string. The algorithm to truncate a list of batched segments - to fit within `sequence_length`. The value can be either - `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at a - time in a round-robin fashion to the inputs that still need - some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using a - "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we run - out of budget. It supports an arbitrary number of segments. Call arguments: x: A dictionary with `encoder_text` and `decoder_text` as its keys. @@ -137,9 +127,8 @@ class BartSeq2SeqLMPreprocessor(BartPreprocessor): def __init__( self, tokenizer, - encoder_sequence_length, - decoder_sequence_length, - truncate="round_robin", + encoder_sequence_length=1024, + decoder_sequence_length=1024, **kwargs ): # Since we truncate the last token from `decoder_token_ids`, we need to @@ -156,16 +145,6 @@ def __init__( self._encoder_sequence_length = encoder_sequence_length self._decoder_sequence_length = decoder_sequence_length - def get_config(self): - config = super().get_config() - config.update( - { - "encoder_sequence_length": self._encoder_sequence_length, - "decoder_sequence_length": self._decoder_sequence_length, - } - ) - return config - def call(self, x, y=None, sample_weight=None): if y is not None or sample_weight is not None: logging.warning( @@ -191,10 +170,6 @@ def call(self, x, y=None, sample_weight=None): sample_weight = decoder_padding_mask[..., 1:] return pack_x_y_sample_weight(x, y, sample_weight) - @classproperty - def presets(cls): - return copy.deepcopy(backbone_presets) - def generate_preprocess( self, x, @@ -212,6 +187,9 @@ def generate_preprocess( the decoder sequence (as generation is expected to continue at the end of the inputted decoder prompt). """ + if not self.built: + self.build(None) + # If `sequence_length` is not provided, we use the default value. 
if sequence_length is None: sequence_length = self._decoder_sequence_length @@ -262,14 +240,15 @@ def generate_postprocess( padding and start/end tokens, and then converting the integer sequence back to a string. """ + if not self.built: + self.build(None) + decoder_token_ids, decoder_padding_mask = ( x["decoder_token_ids"], x["decoder_padding_mask"], ) - if not isinstance(decoder_token_ids, tf.Tensor): - decoder_token_ids = ops.convert_to_numpy(decoder_token_ids) - if not isinstance(decoder_padding_mask, tf.Tensor): - decoder_padding_mask = ops.convert_to_numpy(decoder_padding_mask) + decoder_token_ids = ops.convert_to_numpy(decoder_token_ids) + decoder_padding_mask = ops.convert_to_numpy(decoder_padding_mask) # Strip any special tokens during detokenization, i.e., the start and # end markers. In the future, we could make this configurable. decoder_padding_mask = ( @@ -281,3 +260,17 @@ decoder_token_ids, decoder_padding_mask ) return self.tokenizer.detokenize(decoder_token_ids) + + def get_config(self): + config = super().get_config() + config.update( + { + "encoder_sequence_length": self._encoder_sequence_length, + "decoder_sequence_length": self._decoder_sequence_length, + } + ) + return config + + @classproperty + def presets(cls): + return copy.deepcopy(backbone_presets) diff --git a/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor_test.py b/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor_test.py index 41a3c3dab2..33fbd5fc3a 100644 --- a/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor_test.py +++ b/keras_nlp/models/bart/bart_seq_2_seq_lm_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.bart.bart_seq_2_seq_lm_preprocessor import ( BartSeq2SeqLMPreprocessor, ) @@ -24,129 +23,75 @@ class BartSeq2SeqLMPreprocessorTest(TestCase): def setUp(self): - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - - self.preprocessor = BartSeq2SeqLMPreprocessor( - tokenizer=BartTokenizer( - vocabulary=vocab, - merges=merges, - ), - encoder_sequence_length=10, - decoder_sequence_length=9, + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.tokenizer = BartTokenizer( vocabulary=self.vocab, merges=self.merges ) - - def test_tokenize_strings(self): - input_data = { - "encoder_text": " airplane at airport", - "decoder_text": " kohli is the best", + self.init_kwargs = { + "tokenizer": self.tokenizer, + "encoder_sequence_length": 5, + "decoder_sequence_length": 8, } - - x_out, y_out, sw_out = self.preprocessor(input_data) - self.assertAllEqual( - x_out["encoder_token_ids"], [0, 3, 4, 5, 3, 6, 2, 1, 1, 1] - ) - self.assertAllEqual( - x_out["encoder_padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0] + self.input_data = ( + { + "encoder_text":
[" airplane at airport"], + "decoder_text": [" airplane airport"], + }, ) - self.assertAllEqual( - x_out["decoder_token_ids"], [2, 0, 7, 8, 9, 10, 11, 2, 1] - ) - self.assertAllEqual( - x_out["decoder_padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 0] - ) - self.assertAllEqual(y_out, [0, 7, 8, 9, 10, 11, 2, 1, 1]) - self.assertAllEqual(sw_out, [1, 1, 1, 1, 1, 1, 1, 0, 0]) - def test_tokenize_list_of_strings(self): - input_data = { - "encoder_text": [" airplane at airport"] * 4, - "decoder_text": [" kohli is the best"] * 4, - } - - x_out, y_out, sw_out = self.preprocessor(input_data) - self.assertAllEqual( - x_out["encoder_token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["encoder_padding_mask"], - [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4, - ) - self.assertAllEqual( - x_out["decoder_token_ids"], [[2, 0, 7, 8, 9, 10, 11, 2, 1]] * 4 - ) - self.assertAllEqual( - x_out["decoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, [[0, 7, 8, 9, 10, 11, 2, 1, 1]] * 4) - self.assertAllEqual(sw_out, [[1, 1, 1, 1, 1, 1, 1, 0, 0]] * 4) - - def test_error_multi_segment_input(self): - input_data = { - "encoder_text": ( - tf.constant([" airplane at airport"] * 2), - tf.constant([" airplane"] * 2), + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=BartSeq2SeqLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "encoder_token_ids": [[0, 4, 5, 6, 2]], + "encoder_padding_mask": [[1, 1, 1, 1, 1]], + "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 2, 1]], + "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [[0, 4, 5, 4, 7, 2, 1, 1]], + [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]], ), - "decoder_text": ( - tf.constant([" kohli is the best"] * 2), - tf.constant([" kohli"] * 2), - ), - } - - with self.assertRaises(ValueError): - self.preprocessor(input_data) + ) def test_generate_preprocess(self): + preprocessor = BartSeq2SeqLMPreprocessor(**self.init_kwargs) input_data = { - "encoder_text": tf.convert_to_tensor([" airplane at airport"]), - "decoder_text": tf.convert_to_tensor([" kohli is the best"]), + "encoder_text": [" airplane at airport"], + "decoder_text": [" airplane airport"], } - x_out = self.preprocessor.generate_preprocess(input_data) - self.assertAllEqual( - x_out["encoder_token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1]] - ) - self.assertAllEqual( - x_out["encoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] - ) - self.assertAllEqual( - x_out["decoder_token_ids"], [[2, 0, 7, 8, 9, 10, 11, 1, 1]] - ) - self.assertAllEqual( - x_out["decoder_padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0]] + output = preprocessor.generate_preprocess(input_data) + self.assertAllClose( + output, + { + "encoder_token_ids": [[0, 4, 5, 6, 2]], + "encoder_padding_mask": [[1, 1, 1, 1, 1]], + "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 1, 1]], + "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], + }, ) def test_generate_postprocess(self): + preprocessor = BartSeq2SeqLMPreprocessor(**self.init_kwargs) input_data = { - "decoder_token_ids": tf.constant([2, 0, 7, 8, 9, 10, 11, 1, 1]), - "decoder_padding_mask": tf.cast( - [1, 1, 1, 1, 1, 1, 1, 0, 0], dtype="bool" - ), + "decoder_token_ids": [0, 4, 5, 6, 2], + "decoder_padding_mask": [1, 1, 1, 1, 1], } - x = self.preprocessor.generate_postprocess(input_data) - self.assertAllEqual(x, " kohli is the best") + output = preprocessor.generate_postprocess(input_data) + self.assertAllEqual(output, " airplane at") - def test_serialization(self): - 
new_preprocessor = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.preprocessor) - ) - self.assertEqual( - new_preprocessor.get_config(), self.preprocessor.get_config() - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BartSeq2SeqLMPreprocessor.presets: + self.run_preset_test( + cls=BartSeq2SeqLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bart/bart_seq_2_seq_lm_test.py b/keras_nlp/models/bart/bart_seq_2_seq_lm_test.py index 7c72580daf..280ec33dc6 100644 --- a/keras_nlp/models/bart/bart_seq_2_seq_lm_test.py +++ b/keras_nlp/models/bart/bart_seq_2_seq_lm_test.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from unittest.mock import patch import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.bart.bart_backbone import BartBackbone from keras_nlp.models.bart.bart_seq_2_seq_lm import BartSeq2SeqLM @@ -31,26 +28,12 @@ class BartSeq2SeqLMTest(TestCase): def setUp(self): - self.vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, - } - - self.merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e"] - self.merges += ["s t", "Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e"] - self.merges += ["Ġbe st", "po rt", "pla ne"] + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] self.preprocessor = BartSeq2SeqLMPreprocessor( BartTokenizer(vocabulary=self.vocab, merges=self.merges), encoder_sequence_length=12, @@ -64,64 +47,47 @@ def setUp(self): intermediate_dim=8, max_sequence_length=12, ) - self.seq_2_seq_lm = BartSeq2SeqLM( - backbone=self.backbone, - preprocessor=self.preprocessor, - ) - - self.raw_batch = { - "encoder_text": [" airplane at airport", " airplane at airport"], - "decoder_text": [" kohli is the best", " kohli is the best"], + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, } - - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_seq_2_seq_lm(self): - self.seq_2_seq_lm(self.preprocessed_batch) - - def test_predict(self): - self.seq_2_seq_lm.predict(self.raw_batch) - self.seq_2_seq_lm.preprocessor = None - self.seq_2_seq_lm.predict(self.preprocessed_batch) - - def test_fit(self): - self.seq_2_seq_lm.fit(self.raw_dataset) - self.seq_2_seq_lm.preprocessor = None - self.seq_2_seq_lm.fit(self.preprocessed_dataset) - - def test_fit_no_xla(self): - self.seq_2_seq_lm.preprocessor = None - self.seq_2_seq_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.train_data = ( + { + "encoder_text": [ + " airplane at airport", + " airplane at airport", + ], + "decoder_text": [" airplane airport", " airplane airport"], + }, + ) + self.input_data = self.preprocessor(*self.train_data)[0]
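The `(x, y, sample_weight)` triple produced here by `self.preprocessor(*self.train_data)` comes from a single one-token shift inside the seq-2-seq preprocessor's `call`: the decoder sequence is packed to `decoder_sequence_length + 1`, the features keep all but the last token, and the labels keep all but the first. A minimal numpy sketch of that shift, using the toy vocabulary from `setUp` (`<s>`=0, `<pad>`=1, `</s>`=2, `" airplane"`→[4, 5], `" airport"`→[4, 7]); the values reproduce the expected outputs asserted in the preprocessor tests above:

```python
import numpy as np

# " airplane airport", packed as [</s>, <s>, tokens..., </s>, <pad>...] out to
# decoder_sequence_length + 1 = 9 positions.
packed_ids = np.array([[2, 0, 4, 5, 4, 7, 2, 1, 1]])
packed_mask = np.array([[1, 1, 1, 1, 1, 1, 1, 0, 0]])

x_decoder_token_ids = packed_ids[..., :-1]  # [[2, 0, 4, 5, 4, 7, 2, 1]]
y = packed_ids[..., 1:]                     # [[0, 4, 5, 4, 7, 2, 1, 1]]
sample_weight = packed_mask[..., 1:]        # [[1, 1, 1, 1, 1, 1, 0, 0]]
```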
+ + def test_causal_lm_basics(self): + self.run_task_test( + cls=BartSeq2SeqLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 10, 9), ) - self.seq_2_seq_lm.fit(self.preprocessed_dataset) def test_generate(self): # String input. inputs = { "encoder_text": " airplane at airport", - "decoder_text": " kohli is the best", + "decoder_text": " airplane at", } - output = self.seq_2_seq_lm.generate(inputs) - self.assertTrue(" kohli is the best" in output) + seq_2_seq_lm = BartSeq2SeqLM(**self.init_kwargs) + output = seq_2_seq_lm.generate(inputs) + self.assertTrue(" airplane at" in output) # String tensor input. self.assertIsInstance( - self.seq_2_seq_lm.generate(self.raw_batch)[0], str - ) - # String dataset input. - self.assertIsInstance( - self.seq_2_seq_lm.generate(self.raw_dataset)[0], str + seq_2_seq_lm.generate(" airplane at airport"), str ) # Int tensor input. - self.seq_2_seq_lm.preprocessor = None + seq_2_seq_lm.preprocessor = None preprocessed_batch = self.preprocessor.generate_preprocess(inputs) - outputs = self.seq_2_seq_lm.generate(preprocessed_batch) + outputs = seq_2_seq_lm.generate(preprocessed_batch) # Assert prompt is in output in token id space. self.assertAllEqual( outputs["decoder_token_ids"][:, :5], @@ -132,27 +98,9 @@ def test_generate(self): preprocessed_batch["decoder_padding_mask"][:, :5], ) - def test_generate_string_in_string_out(self): - # String input. - inputs = " airplane at airport" - self.seq_2_seq_lm.generate(inputs) - - # String tensor input. - self.assertIsInstance( - self.seq_2_seq_lm.generate( - [" airplane at airport", " airplane at airport"] - )[0], - str, - ) - - # String dataset input. - raw_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant([" airplane at airport", " airplane at airport"]) - ).batch(2) - self.assertIsInstance(self.seq_2_seq_lm.generate(raw_dataset)[0], str) - def test_early_stopping(self): - call_decoder_with_cache = self.seq_2_seq_lm.call_decoder_with_cache + seq_2_seq_lm = BartSeq2SeqLM(**self.init_kwargs) + call_decoder_with_cache = seq_2_seq_lm.call_decoder_with_cache def wrapper(*args, **kwargs): """Modify output logits to always favor end_token_id""" @@ -174,61 +122,52 @@ def wrapper(*args, **kwargs): ) with patch.object( - self.seq_2_seq_lm, "call_decoder_with_cache", wraps=wrapper + seq_2_seq_lm, "call_decoder_with_cache", wraps=wrapper ): inputs = { "encoder_text": [ " airplane at airport", " airplane at airport", ], - "decoder_text": [" kohli is the best", " kohli"], + "decoder_text": [" airplane at", " airplane"], } - output = self.seq_2_seq_lm.generate(inputs) - + output = seq_2_seq_lm.generate(inputs) # We should immediately abort and output the prompt. self.assertAllEqual(inputs["decoder_text"], output) - # TODO: fix beam search. - @pytest.mark.tf_only - def test_beam_search(self): - seq_2_seq_lm = BartSeq2SeqLM( - backbone=self.backbone, - preprocessor=self.preprocessor, - ) - seq_2_seq_lm.compile(sampler="beam") - seq_2_seq_lm.generate(self.raw_batch) - def test_generate_compilation(self): + seq_2_seq_lm = BartSeq2SeqLM(**self.init_kwargs) # Assert we do not recompile with successive calls. 
- self.seq_2_seq_lm.generate(self.raw_batch) - first_fn = self.seq_2_seq_lm.generate_function - self.seq_2_seq_lm.generate(self.raw_batch) - second_fn = self.seq_2_seq_lm.generate_function + seq_2_seq_lm.generate(" airplane at airport") + first_fn = seq_2_seq_lm.generate_function + seq_2_seq_lm.generate(" airplane at airport") + second_fn = seq_2_seq_lm.generate_function self.assertEqual(first_fn, second_fn) # Assert we do recompile after compile is called. - self.seq_2_seq_lm.compile(sampler="greedy") - self.assertIsNone(self.seq_2_seq_lm.generate_function) + seq_2_seq_lm.compile(sampler="greedy") + self.assertIsNone(seq_2_seq_lm.generate_function) - def test_serialization(self): - new_seq_2_seq_lm = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.seq_2_seq_lm) - ) - self.assertEqual( - new_seq_2_seq_lm.get_config(), self.seq_2_seq_lm.get_config() + def test_beam_search(self): + seq_2_seq_lm = BartSeq2SeqLM( + backbone=self.backbone, + preprocessor=self.preprocessor, ) + seq_2_seq_lm.compile(sampler="beam") + seq_2_seq_lm.generate(" airplane at airport") @pytest.mark.large def test_saved_model(self): - keras.utils.set_random_seed(42) - model_output = self.seq_2_seq_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.seq_2_seq_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, BartSeq2SeqLM) + self.run_model_saving_test( + cls=BartSeq2SeqLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - keras.utils.set_random_seed(42) - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BartSeq2SeqLM.presets: + self.run_preset_test( + cls=BartSeq2SeqLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bart/bart_tokenizer.py b/keras_nlp/models/bart/bart_tokenizer.py index 0f8728d7dd..17fb237b88 100644 --- a/keras_nlp/models/bart/bart_tokenizer.py +++ b/keras_nlp/models/bart/bart_tokenizer.py @@ -78,34 +78,45 @@ class BartTokenizer(BytePairTokenizer): def __init__( self, - vocabulary, - merges, + vocabulary=None, + merges=None, **kwargs, ): - # Special tokens. - start_token = "<s>" - pad_token = "<pad>" - end_token = "</s>" + self.start_token = "<s>" + self.pad_token = "<pad>" + self.end_token = "</s>" super().__init__( vocabulary=vocabulary, merges=merges, - unsplittable_tokens=[start_token, pad_token, end_token], + unsplittable_tokens=[ + self.start_token, + self.pad_token, + self.end_token, + ], **kwargs, ) - # Check whether special tokens are present in the vocabulary. - for token in [start_token, pad_token, end_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.start_token_id = self.token_to_id(start_token) - self.pad_token_id = self.token_to_id(pad_token) - self.end_token_id = self.token_to_id(end_token) + def set_vocabulary_and_merges(self, vocabulary, merges): + super().set_vocabulary_and_merges(vocabulary, merges) + + if vocabulary is not None: + # Check for necessary special tokens.
+ for token in [self.start_token, self.pad_token, self.end_token]: + if token not in self.vocabulary: + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.start_token_id = self.token_to_id(self.start_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.end_token_id = self.token_to_id(self.end_token) + else: + self.start_token_id = None + self.pad_token_id = None + self.end_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/bart/bart_tokenizer_test.py b/keras_nlp/models/bart/bart_tokenizer_test.py index d3e77958e8..5a0015357b 100644 --- a/keras_nlp/models/bart/bart_tokenizer_test.py +++ b/keras_nlp/models/bart/bart_tokenizer_test.py @@ -12,66 +12,57 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.backend import keras +import pytest + from keras_nlp.models.bart.bart_tokenizer import BartTokenizer from keras_nlp.tests.test_case import TestCase class BartTokenizerTest(TestCase): def setUp(self): - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - - self.tokenizer = BartTokenizer(vocabulary=vocab, merges=merges) - - def test_tokenize(self): - input_data = " airplane at airport" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [3, 4, 5, 3, 6]) - - def test_tokenize_special_tokens(self): - input_data = "<s> airplane at airport</s><pad>" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [0, 3, 4, 5, 3, 6, 0, 1]) + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + self.input_data = [ + "<s> airplane at airport</s><pad>", + " airplane airport", + ] - def test_tokenize_batch(self): - input_data = [" airplane at airport", " kohli is the best"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[3, 4, 5, 3, 6], [7, 8, 9, 10, 11]]) - - def test_detokenize(self): - input_tokens = [[3, 4, 5, 3, 6]] - output = self.tokenizer.detokenize(input_tokens) - self.assertAllEqual(output, [" airplane at airport"]) - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 12) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=BartTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + # TODO: </s> should not get tokenized as <s> + expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]], + expected_detokenize_output=[ + "<s> airplane at airport<s><pad>", + " airplane airport", + ], + ) def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): BartTokenizer(vocabulary=["a", "b", "c"], merges=[]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer =
keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=BartTokenizer, + preset="bart_base_en", + input_data=["The quick brown fox."], + expected_output=[[133, 2119, 6219, 23602, 4]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BartTokenizer.presets: + self.run_preset_test( + cls=BartTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bert/bert_backbone.py b/keras_nlp/models/bert/bert_backbone.py index 381f1f8cb4..174b0f0e42 100644 --- a/keras_nlp/models/bert/bert_backbone.py +++ b/keras_nlp/models/bert/bert_backbone.py @@ -21,6 +21,7 @@ from keras_nlp.layers.modeling.transformer_encoder import TransformerEncoder from keras_nlp.models.backbone import Backbone from keras_nlp.models.bert.bert_presets import backbone_presets +from keras_nlp.utils.keras_utils import gelu_approximate from keras_nlp.utils.python_utils import classproperty @@ -151,9 +152,7 @@ def __init__( x = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, - activation=lambda x: keras.activations.gelu( - x, approximate=True - ), + activation=gelu_approximate, dropout=dropout, layer_norm_epsilon=1e-12, kernel_initializer=bert_kernel_initializer(), @@ -163,13 +162,12 @@ def __init__( # Construct the two BERT outputs. The pooled output is a dense layer on # top of the [CLS] token. sequence_output = x - x = keras.layers.Dense( + pooled_output = keras.layers.Dense( hidden_dim, kernel_initializer=bert_kernel_initializer(), activation="tanh", name="pooled_dense", - )(x) - pooled_output = x[:, cls_token_index, :] + )(x[:, cls_token_index, :]) # Instantiate using Functional API Model constructor super().__init__( diff --git a/keras_nlp/models/bert/bert_backbone_test.py b/keras_nlp/models/bert/bert_backbone_test.py index 6374a6848f..c1039114f5 100644 --- a/keras_nlp/models/bert/bert_backbone_test.py +++ b/keras_nlp/models/bert/bert_backbone_test.py @@ -12,103 +12,78 @@ # See the License for the specific language governing permissions and # limitations under the License. 
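Note on the `bert_backbone.py` hunk above: swapping the inline `lambda x: keras.activations.gelu(x, approximate=True)` for the imported `gelu_approximate` utility matters for serialization, since a named module-level function can be stored in and restored from a model config while a lambda cannot. A minimal sketch of what such a helper presumably looks like (the real one lives in `keras_nlp/utils/keras_utils.py` and may differ in detail):

```python
from keras_nlp.backend import keras


def gelu_approximate(x):
    # Serializable stand-in for the previous inline lambda: same math,
    # but importable by name when a saved config is reloaded.
    return keras.activations.gelu(x, approximate=True)
```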
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.bert.bert_backbone import BertBackbone from keras_nlp.tests.test_case import TestCase class BertBackboneTest(TestCase): def setUp(self): - self.backbone = BertBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "segment_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_bert(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "bert_backbone") - - def test_variable_sequence_length_call_bert(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "segment_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=BertBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "sequence_output": (2, 5, 2), + "pooled_output": (2, 2), + }, ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, BertBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=BertBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=BertBackbone, + preset="bert_tiny_en_uncased", + input_data={ + "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"), + "segment_ids": ops.zeros((1, 4), dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape={ + "sequence_output": (1, 4, 128), + "pooled_output": (1, 128), + }, + # The forward pass from a preset should be stable! 
+ expected_partial_output={ + "sequence_output": ( + ops.array([-1.38173, 0.16598, -2.92788, -2.66958, -0.61556]) + ), + "pooled_output": ( + ops.array([-0.99999, 0.07777, -0.99955, -0.00982, -0.99967]) + ), + }, + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class BertBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = BertBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=64, - intermediate_dim=128, - max_sequence_length=128, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BertBackbone.presets: + self.run_preset_test( + cls=BertBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "segment_ids": np.ones((8, 128), dtype="int32"), - "padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/bert/bert_classifier_test.py b/keras_nlp/models/bert/bert_classifier_test.py index 92122335b5..53da9d4fab 100644 --- a/keras_nlp/models/bert/bert_classifier_test.py +++ b/keras_nlp/models/bert/bert_classifier_test.py @@ -12,14 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.bert.bert_backbone import BertBackbone from keras_nlp.models.bert.bert_classifier import BertClassifier from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor @@ -42,85 +36,42 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.classifier = BertClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, + max_sequence_length=self.preprocessor.sequence_length, ) - - # Setup data. - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - optimizer="adam", - loss="sparse_categorical_crossentropy", - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. + [1, 0], # Labels. 
) - self.classifier.fit(self.preprocessed_dataset) + self.input_data = self.preprocessor(*self.train_data)[0] - def test_serialization(self): - # Defaults. - original = BertClassifier( - self.backbone, - num_classes=2, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. - original = BertClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - name="test", - trainable=False, + def test_classifier_basics(self): + self.run_task_test( + cls=BertClassifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) @pytest.mark.large - def test_saving_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back - self.assertIsInstance(restored_model, BertClassifier) + def test_saved_model(self): + self.run_model_saving_test( + cls=BertClassifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BertClassifier.presets: + self.run_preset_test( + cls=BertClassifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/bert/bert_masked_lm_preprocessor.py b/keras_nlp/models/bert/bert_masked_lm_preprocessor.py index 685c65c70b..cdc61fbac3 100644 --- a/keras_nlp/models/bert/bert_masked_lm_preprocessor.py +++ b/keras_nlp/models/bert/bert_masked_lm_preprocessor.py @@ -134,33 +134,30 @@ def __init__( truncate=truncate, **kwargs, ) - + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer masker creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.cls_token_id, - tokenizer.sep_token_id, - tokenizer.pad_token_id, + self.tokenizer.cls_token_id, + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, ], ) - def get_config(self): - config = super().get_config() - config.update( - { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, - } - ) - return config - def call(self, x, y=None, sample_weight=None): if y is not None or sample_weight is not None: logging.warning( @@ -187,3 +184,15 @@ def call(self, x, y=None, sample_weight=None): y = masker_outputs["mask_ids"] sample_weight = masker_outputs["mask_weights"] return pack_x_y_sample_weight(x, y, sample_weight) + + def get_config(self): + config = super().get_config() + config.update( + { + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, + } + ) + return config diff --git a/keras_nlp/models/bert/bert_masked_lm_preprocessor_test.py b/keras_nlp/models/bert/bert_masked_lm_preprocessor_test.py index ad208c24a3..ff58962215 100644 --- a/keras_nlp/models/bert/bert_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/bert/bert_masked_lm_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.bert.bert_masked_lm_preprocessor import ( BertMaskedLMPreprocessor, ) @@ -27,106 +26,62 @@ def setUp(self): self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["THE", "QUICK", "BROWN", "FOX"] self.vocab += ["the", "quick", "brown", "fox"] - - tokenizer = BertTokenizer(vocabulary=self.vocab) - - self.preprocessor = BertMaskedLMPreprocessor( - tokenizer=tokenizer, + self.tokenizer = BertTokenizer(vocabulary=self.vocab) + self.init_kwargs = { + "tokenizer": self.tokenizer, # Simplify our testing by masking every available token. 
- mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=4, - sequence_length=12, - ) - - def test_preprocess_strings(self): - input_data = "the quick brown fox" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["segment_ids"], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, + } + self.input_data = ["the quick brown fox"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=BertMaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[9, 10, 11, 12]], + [[1.0, 1.0, 1.0, 1.0]], + ), ) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4]) - self.assertAllEqual(y, [9, 10, 11, 12]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) - - def test_preprocess_list_of_strings(self): - input_data = ["the quick brown fox"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], - [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4, - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[9, 10, 11, 12]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_preprocess_dataset(self): - sentences = tf.constant(["the quick brown fox"] * 4) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x["token_ids"], [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[9, 10, 11, 12]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_mask_multiple_sentences(self): - sentence_one = tf.constant("the quick") - sentence_two = tf.constant("brown fox") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - x["token_ids"], [2, 4, 4, 3, 4, 4, 3, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 4, 5]) - self.assertAllEqual(y, [9, 10, 11, 12]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) def test_no_masking_zero_rate(self): no_mask_preprocessor = BertMaskedLMPreprocessor( - self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, mask_selection_length=4, sequence_length=12, ) - input_data = "the quick brown fox" - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [2, 9, 10, 11, 12, 3, 0, 0, 0, 0, 0, 0] + input_data = ["the quick brown fox"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[2, 9, 10, 11, 12, 3, 0, 0, 0, 0, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 
0, 0, 0]], + [[0.0, 0.0, 0.0, 0.0]], + ), ) - self.assertAllEqual( - x["padding_mask"], - [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], - ) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0]) - self.assertAllEqual(y, [0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BertMaskedLMPreprocessor.presets: + self.run_preset_test( + cls=BertMaskedLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bert/bert_masked_lm_test.py b/keras_nlp/models/bert/bert_masked_lm_test.py index 8659de2474..dd6f41b0b7 100644 --- a/keras_nlp/models/bert/bert_masked_lm_test.py +++ b/keras_nlp/models/bert/bert_masked_lm_test.py @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.bert.bert_backbone import BertBackbone from keras_nlp.models.bert.bert_masked_lm import BertMaskedLM from keras_nlp.models.bert.bert_masked_lm_preprocessor import ( @@ -47,63 +43,38 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.masked_lm = BertMaskedLM( - self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - - # Setup data. - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call(self): - self.masked_lm(self.preprocessed_batch[0]) - - def test_predict(self): - self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch[0]) - - def test_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - optimizer="adam", - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. 
) - self.masked_lm.fit(self.preprocessed_dataset) + self.input_data = self.preprocessor(*self.train_data)[0] - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.masked_lm) - new_classifier = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_classifier.get_config(), - self.masked_lm.get_config(), + def test_masked_lm_basics(self): + self.run_task_test( + cls=BertMaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 10), ) @pytest.mark.large def test_saved_model(self): - model_output = self.masked_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) + self.run_model_saving_test( + cls=BertMaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check we got the real object back. - self.assertIsInstance(restored_model, BertMaskedLM) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output, atol=0.01, rtol=0.01) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BertMaskedLM.presets: + self.run_preset_test( + cls=BertMaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bert/bert_preprocessor.py b/keras_nlp/models/bert/bert_preprocessor.py index 214193753d..bad38f22a5 100644 --- a/keras_nlp/models/bert/bert_preprocessor.py +++ b/keras_nlp/models/bert/bert_preprocessor.py @@ -139,23 +139,21 @@ def __init__( ): super().__init__(**kwargs) self.tokenizer = tokenizer + self.sequence_length = sequence_length + self.truncate = truncate + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.packer = MultiSegmentPacker( start_value=self.tokenizer.cls_token_id, end_value=self.tokenizer.sep_token_id, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, - } - ) - return config + self.built = True def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) @@ -168,6 +166,16 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "truncate": self.truncate, + } + ) + return config + @classproperty def tokenizer_cls(cls): return BertTokenizer diff --git a/keras_nlp/models/bert/bert_preprocessor_test.py b/keras_nlp/models/bert/bert_preprocessor_test.py index efedb2d550..6d1e5fee57 100644 --- a/keras_nlp/models/bert/bert_preprocessor_test.py +++ b/keras_nlp/models/bert/bert_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
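The `bert_preprocessor.py` hunk above is the heart of this refactor: the `MultiSegmentPacker` is created in `build()` rather than `__init__`, and `get_config()` reads plain attributes (`self.sequence_length`, `self.truncate`) instead of reaching into the packer. Because the packer needs `cls_token_id` and friends, deferring its creation means a restored preprocessor only requires tokenizer assets by the time of the first call; the masked-LM preprocessors apply the same pattern to their `MaskedLMMaskGenerator`. A hedged sketch of the flow this enables (the vocabulary below is illustrative):

```python
from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor
from keras_nlp.models.bert.bert_tokenizer import BertTokenizer

# The tokenizer can now be constructed without a vocabulary...
tokenizer = BertTokenizer()
preprocessor = BertPreprocessor(tokenizer, sequence_length=8)

# ...as long as assets arrive before the first call. When a saved model is
# restored, asset loading plays the role of this `set_vocabulary()` call.
tokenizer.set_vocabulary(
    ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "the", "fox"]
)
preprocessor("the fox")  # `build()` runs here and creates the packer.
```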
-import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor from keras_nlp.models.bert.bert_tokenizer import BertTokenizer from keras_nlp.tests.test_case import TestCase @@ -25,93 +24,44 @@ def setUp(self): self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["THE", "QUICK", "BROWN", "FOX"] self.vocab += ["the", "quick", "brown", "fox"] - self.preprocessor = BertPreprocessor( - BertTokenizer(vocabulary=self.vocab), - sequence_length=8, + self.tokenizer = BertTokenizer(vocabulary=self.vocab) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["THE QUICK BROWN FOX."], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ) - def test_tokenize_strings(self): - input_data = "THE QUICK BROWN FOX." - output = self.preprocessor(input_data) - self.assertAllEqual(output["token_ids"], [2, 5, 6, 7, 8, 1, 3, 0]) - self.assertAllEqual(output["segment_ids"], [0, 0, 0, 0, 0, 0, 0, 0]) - self.assertAllEqual(output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - - def test_tokenize_list_of_strings(self): - # We should handle a list of strings as as batch. - input_data = ["THE QUICK BROWN FOX."] * 4 - output = self.preprocessor(input_data) - self.assertAllEqual(output["token_ids"], [[2, 5, 6, 7, 8, 1, 3, 0]] * 4) - self.assertAllEqual( - output["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant(["THE QUICK BROWN FOX."] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual(x_out["token_ids"], [[2, 5, 6, 7, 8, 1, 3, 0]] * 4) - self.assertAllEqual( - x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["THE QUICK BROWN FOX."] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x_out["token_ids"], [[2, 5, 6, 7, 8, 1, 3, 0]] * 4) - self.assertAllEqual( - x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant("THE QUICK") - sentence_two = tf.constant("BROWN FOX.") - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual(output["token_ids"], [2, 5, 6, 3, 7, 8, 1, 3]) - self.assertAllEqual(output["segment_ids"], [0, 0, 0, 0, 1, 1, 1, 1]) - self.assertAllEqual(output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1]) - - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant(["THE QUICK"] * 4) - sentence_two = tf.constant(["BROWN FOX."] * 4) - # The first tuple or list is always interpreted as an enumeration of - # separate sequences to concatenate. 
- output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual(output["token_ids"], [[2, 5, 6, 3, 7, 8, 1, 3]] * 4) - self.assertAllEqual( - output["segment_ids"], [[0, 0, 0, 0, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=BertPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 5, 6, 7, 8, 1, 3, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. + ), ) def test_errors_for_2d_list_input(self): + preprocessor = BertPreprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) + preprocessor(ambiguous_input) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BertPreprocessor.presets: + self.run_preset_test( + cls=BertPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/bert/bert_presets.py b/keras_nlp/models/bert/bert_presets.py index 7a3bbdce63..09315db192 100644 --- a/keras_nlp/models/bert/bert_presets.py +++ b/keras_nlp/models/bert/bert_presets.py @@ -13,8 +13,6 @@ # limitations under the License. """BERT model preset configurations.""" -# TODO(jbischof): document presets in keras.io and use URL in docstrings -# Metadata for loading pretrained model weights. 
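The preset dictionaries that follow replace the old inline `config`, `preprocessor_config`, and `weights_url`/`vocabulary_url` (plus hash) bookkeeping with a single `kaggle_handle` per preset; architecture, weights, and vocabulary all resolve from that handle. The user-facing API is unchanged, roughly:

```python
from keras_nlp.models.bert.bert_backbone import BertBackbone
from keras_nlp.models.bert.bert_tokenizer import BertTokenizer

# Both of these now resolve "kaggle://keras/bert/keras/bert_tiny_en_uncased/2"
# under the hood; no explicit config or weights URL is needed.
backbone = BertBackbone.from_preset("bert_tiny_en_uncased")
tokenizer = BertTokenizer.from_preset("bert_tiny_en_uncased")
```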
backbone_presets = { "bert_tiny_en_uncased": { "metadata": { @@ -27,23 +25,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30522, - "num_layers": 2, - "num_heads": 2, - "hidden_dim": 128, - "intermediate_dim": 512, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/model.h5", - "weights_hash": "c2b29fcbf8f814a0812e4ab89ef5c068", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/bert/keras/bert_tiny_en_uncased/2", }, "bert_small_en_uncased": { "metadata": { @@ -56,23 +38,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30522, - "num_layers": 4, - "num_heads": 8, - "hidden_dim": 512, - "intermediate_dim": 2048, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_small_en_uncased/v1/model.h5", - "weights_hash": "08632c9479b034f342ba2c2b7afba5f7", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_small_en_uncased/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/bert/keras/bert_small_en_uncased/2", }, "bert_medium_en_uncased": { "metadata": { @@ -85,23 +51,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30522, - "num_layers": 8, - "num_heads": 8, - "hidden_dim": 512, - "intermediate_dim": 2048, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_medium_en_uncased/v1/model.h5", - "weights_hash": "bb990e1184ec6b6185450c73833cd661", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_medium_en_uncased/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/bert/keras/bert_medium_en_uncased/2", }, "bert_base_en_uncased": { "metadata": { @@ -114,23 +64,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30522, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_en_uncased/v1/model.h5", - "weights_hash": "9b2b2139f221988759ac9cdd17050b31", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_en_uncased/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/bert/keras/bert_base_en_uncased/2", }, "bert_base_en": { "metadata": { @@ -143,23 +77,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 28996, - "num_layers": 12, - "num_heads": 12, - 
"hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": False, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_en/v1/model.h5", - "weights_hash": "f94a6cb012e18f4fb8ec92abb91864e9", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_en/v1/vocab.txt", - "vocabulary_hash": "bb6ca9b42e790e5cd986bbb16444d0e0", + "kaggle_handle": "kaggle://keras/bert/keras/bert_base_en/2", }, "bert_base_zh": { "metadata": { @@ -171,23 +89,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 21128, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": False, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_zh/v1/model.h5", - "weights_hash": "79afa421e386076e62ab42dad555ab0c", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_zh/v1/vocab.txt", - "vocabulary_hash": "3b5b76c4aef48ecf8cb3abaafe960f09", + "kaggle_handle": "kaggle://keras/bert/keras/bert_base_zh/2", }, "bert_base_multi": { "metadata": { @@ -199,23 +101,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 119547, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": False, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_multi/v1/model.h5", - "weights_hash": "b0631cec0a1f2513c6cfd75ba29c33aa", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_base_multi/v1/vocab.txt", - "vocabulary_hash": "d9d865138d17f1958502ed060ecfeeb6", + "kaggle_handle": "kaggle://keras/bert/keras/bert_base_multi/2", }, "bert_large_en_uncased": { "metadata": { @@ -228,23 +114,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 30522, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_large_en_uncased/v1/model.h5", - "weights_hash": "cc5cacc9565ef400ee4376105f40ddae", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_large_en_uncased/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/bert/keras/bert_large_en_uncased/2", }, "bert_large_en": { "metadata": { @@ -257,23 +127,7 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "vocabulary_size": 28996, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 2, - }, - "preprocessor_config": { - "lowercase": False, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_large_en/v1/model.h5", - "weights_hash": "8b8ab82290bbf4f8db87d4f100648890", - "vocabulary_url": 
"https://storage.googleapis.com/keras-nlp/models/bert_large_en/v1/vocab.txt", - "vocabulary_hash": "bb6ca9b42e790e5cd986bbb16444d0e0", + "kaggle_handle": "kaggle://keras/bert/keras/bert_large_en/2", }, } @@ -288,29 +142,6 @@ "path": "bert", "model_card": "https://github.com/google-research/bert/blob/master/README.md", }, - "config": { - "backbone": { - "class_name": "keras_nlp>BertBackbone", - "config": { - "vocabulary_size": 30522, - "hidden_dim": 128, - "intermediate_dim": 512, - "num_layers": 2, - "num_heads": 2, - "max_sequence_length": 512, - "num_segments": 2, - "dropout": 0.1, - }, - }, - "num_classes": 2, - "dropout": 0.1, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased_sst2/v1/model.h5", - "weights_hash": "1f9c2d59f9e229e08f3fbd44239cfb0b", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/bert_tiny_en_uncased_sst2/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/bert/keras/bert_tiny_en_uncased_sst2/3", } } diff --git a/keras_nlp/models/bert/bert_presets_test.py b/keras_nlp/models/bert/bert_presets_test.py deleted file mode 100644 index 71e739bbfc..0000000000 --- a/keras_nlp/models/bert/bert_presets_test.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.bert.bert_backbone import BertBackbone -from keras_nlp.models.bert.bert_classifier import BertClassifier -from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor -from keras_nlp.models.bert.bert_tokenizer import BertTokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class BertPresetSmokeTest(TestCase): - """ - A smoke test for BERT presets we run continuously. - - This only tests the smallest weights we have available. 
Run with: - `pytest keras_nlp/models/bert/bert_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = BertTokenizer.from_preset( - "bert_tiny_en_uncased", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [1996, 4248, 2829, 4419, 1012] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - tokenizer = BertPreprocessor.from_preset( - "bert_tiny_en_uncased", - sequence_length=4, - ) - outputs = tokenizer("The quick brown fox.")["token_ids"] - expected_outputs = [101, 1996, 4248, 102] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "segment_ids": ops.array([[0, 0, 0, 0]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = BertBackbone.from_preset( - "bert_tiny_en_uncased", load_weights=load_weights - ) - outputs = model(input_data)["sequence_output"] - if load_weights: - # The forward pass from a preset should be stable! - # This test should catch cases where we unintentionally change our - # network code in a way that would invalidate our preset weights. - # We should only update these numbers if we are updating a weights - # file, or have found a discrepancy with the upstream source. - outputs = outputs[0, 0, :5] - expected = [-1.38173, 0.16598, -2.92788, -2.66958, -0.61556] - # Keep a high tolerance, so we are robust to different hardware. - self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["The quick brown fox."] - model = BertClassifier.from_preset( - "bert_tiny_en_uncased", - num_classes=2, - load_weights=load_weights, - ) - # We don't assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_classifier_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "segment_ids": ops.array([[0, 0, 0, 0]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = BertClassifier.from_preset( - "bert_tiny_en_uncased", - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - # Never assert output values, as the head weights are random. 
- model.predict(input_data) - - @parameterized.named_parameters( - ("bert_tokenizer", BertTokenizer, {}), - ("bert_preprocessor", BertPreprocessor, {}), - ("bert", BertBackbone, {}), - ("bert_classifier", BertClassifier, {"num_classes": 2}), - ) - def test_preset_mutability(self, cls, kwargs): - preset = "bert_tiny_en_uncased" - obj = cls.from_preset(preset, **kwargs) - # Cannot overwrite the presents attribute in an object - with self.assertRaises(AttributeError): - obj.presets = {"my_model": "clowntown"} - # Cannot mutate presents in an object - config = obj.presets[preset]["config"] - config["num_layers"] = 1 - self.assertEqual(config["num_layers"], 1) - self.assertEqual(obj.presets[preset]["config"]["num_layers"], 2) - # Cannot mutate presets in the class - config = BertBackbone.presets[preset]["config"] - config["num_layers"] = 1 - self.assertEqual(config["num_layers"], 1) - self.assertEqual( - BertBackbone.presets[preset]["config"]["num_layers"], 2 - ) - - @parameterized.named_parameters( - ("bert_tokenizer", BertTokenizer), - ("bert_preprocessor", BertPreprocessor), - ("bert", BertBackbone), - ("bert_classifier", BertClassifier), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("bert_tokenizer", BertTokenizer, {}), - ("bert_preprocessor", BertPreprocessor, {}), - ("bert", BertBackbone, {}), - ("bert_classifier", BertClassifier, {"num_classes": 2}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("bert_base_uncased_clowntown", **kwargs) - - def test_override_preprocessor_sequence_length(self): - """Override sequence length longer than model's maximum.""" - preprocessor = BertPreprocessor.from_preset( - "bert_base_en_uncased", - sequence_length=64, - ) - self.assertEqual(preprocessor.get_config()["sequence_length"], 64) - preprocessor("The quick brown fox.") - - def test_override_preprocessor_sequence_length_gt_max(self): - """Override sequence length longer than model's maximum.""" - with self.assertRaises(ValueError): - BertPreprocessor.from_preset( - "bert_base_en_uncased", - sequence_length=1024, - ) - - -@pytest.mark.extra_large -class BertPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This every presets for BERT and is only run manually. 
- Run with: - `pytest keras_nlp/models/bert/bert_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_bert(self, load_weights): - for preset in BertBackbone.presets: - model = BertBackbone.from_preset(preset, load_weights=load_weights) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "segment_ids": ops.array([0] * 200 + [1] * 312, shape=(1, 512)), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_bert_classifier(self, load_weights): - for preset in BertClassifier.presets: - classifier = BertClassifier.from_preset( - preset, - num_classes=2, - load_weights=load_weights, - ) - input_data = ["This quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_bert_classifier_without_preprocessing(self, load_weights): - for preset in BertClassifier.presets: - classifier = BertClassifier.from_preset( - preset, - num_classes=2, - preprocessor=None, - load_weights=load_weights, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "segment_ids": ops.array([0] * 200 + [1] * 312, shape=(1, 512)), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in BertTokenizer.presets: - tokenizer = BertTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in BertPreprocessor.presets: - preprocessor = BertPreprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/bert/bert_tokenizer.py b/keras_nlp/models/bert/bert_tokenizer.py index 5c01bf24e1..1b634fe9b3 100644 --- a/keras_nlp/models/bert/bert_tokenizer.py +++ b/keras_nlp/models/bert/bert_tokenizer.py @@ -74,33 +74,42 @@ class BertTokenizer(WordPieceTokenizer): def __init__( self, - vocabulary, + vocabulary=None, lowercase=False, **kwargs, ): + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.pad_token = "[PAD]" + self.mask_token = "[MASK]" super().__init__( vocabulary=vocabulary, lowercase=lowercase, **kwargs, ) - # Check for necessary special tokens. - cls_token = "[CLS]" - sep_token = "[SEP]" - pad_token = "[PAD]" - mask_token = "[MASK]" - for token in [cls_token, pad_token, sep_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.cls_token_id = self.token_to_id(cls_token) - self.sep_token_id = self.token_to_id(sep_token) - self.pad_token_id = self.token_to_id(pad_token) - self.mask_token_id = self.token_to_id(mask_token) + def set_vocabulary(self, vocabulary): + super().set_vocabulary(vocabulary) + + if vocabulary is not None: + # Check for necessary special tokens. + for token in [self.cls_token, self.pad_token, self.sep_token]: + if token not in self.vocabulary: + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." 
+ ) + + self.cls_token_id = self.token_to_id(self.cls_token) + self.sep_token_id = self.token_to_id(self.sep_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.mask_token_id = self.token_to_id(self.mask_token) + else: + self.cls_token_id = None + self.sep_token_id = None + self.pad_token_id = None + self.mask_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/bert/bert_tokenizer_test.py b/keras_nlp/models/bert/bert_tokenizer_test.py index 2cd1baa490..e53419dab4 100644 --- a/keras_nlp/models/bert/bert_tokenizer_test.py +++ b/keras_nlp/models/bert/bert_tokenizer_test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.backend import keras +import pytest + from keras_nlp.models.bert.bert_tokenizer import BertTokenizer from keras_nlp.tests.test_case import TestCase @@ -22,40 +23,40 @@ def setUp(self): self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["THE", "QUICK", "BROWN", "FOX"] self.vocab += ["the", "quick", "brown", "fox"] - self.tokenizer = BertTokenizer(vocabulary=self.vocab) - - def test_tokenize(self): - input_data = "THE QUICK BROWN FOX." - output = self.tokenizer(input_data) - self.assertAllEqual(output, [5, 6, 7, 8, 1]) - - def test_tokenize_batch(self): - input_data = ["THE QUICK BROWN FOX.", "THE FOX."] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[5, 6, 7, 8, 1], [5, 8, 1]]) + self.init_kwargs = {"vocabulary": self.vocab} + self.input_data = ["THE QUICK BROWN FOX", "THE FOX"] + + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=BertTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[5, 6, 7, 8], [5, 8]], + ) def test_lowercase(self): - input_data = "THE QUICK BROWN FOX." tokenizer = BertTokenizer(vocabulary=self.vocab, lowercase=True) - output = tokenizer(input_data) - self.assertAllEqual(output, [9, 10, 11, 12, 1]) - - def test_detokenize(self): - input_tokens = [[5, 6, 7, 8]] - output = self.tokenizer.detokenize(input_tokens) - self.assertAllEqual(output, ["THE QUICK BROWN FOX"]) - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 13) + output = tokenizer(self.input_data) + self.assertAllEqual(output, [[9, 10, 11, 12], [9, 12]]) def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): BertTokenizer(vocabulary=["a", "b", "c"]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=BertTokenizer, + preset="bert_tiny_en_uncased", + input_data=["The quick brown fox."], + expected_output=[[1996, 4248, 2829, 4419, 1012]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in BertTokenizer.presets: + self.run_preset_test( + cls=BertTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py index b963d35433..aa5077ec67 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py @@ -68,7 +68,7 @@ class DebertaV3Backbone(Backbone): bucket_size: int. The size of the relative position buckets. 
Generally equal to `max_sequence_length // 2`. - Example usage: + Example: ```python input_data = { "token_ids": np.ones(shape=(1, 12), dtype="int32"), @@ -153,9 +153,7 @@ def __init__( max_position_embeddings=max_sequence_length, bucket_size=bucket_size, dropout=dropout, - activation=lambda x: keras.activations.gelu( - x, approximate=False - ), + activation=keras.activations.gelu, layer_norm_epsilon=1e-7, kernel_initializer=deberta_kernel_initializer(), name=f"disentangled_attention_encoder_layer_{i}", diff --git a/keras_nlp/models/deberta_v3/deberta_v3_backbone_test.py b/keras_nlp/models/deberta_v3/deberta_v3_backbone_test.py index cd38d63f48..3559002864 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_backbone_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_backbone_test.py @@ -12,13 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone from keras_nlp.tests.test_case import TestCase @@ -26,94 +21,58 @@ class DebertaV3BackboneTest(TestCase): def setUp(self): - self.backbone = DebertaV3Backbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - bucket_size=2, - ) - self.batch_size = 8 - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_deberta(self): - self.backbone(self.input_batch) - - def test_name(self): - self.assertRegexpMatches(self.backbone.name, "deberta_v3_backbone") - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_variable_sequence_length_call_deberta(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - output = self.backbone(input_data) - self.assertAllEqual( - ops.shape(output), - [2, seq_length, self.backbone.hidden_dim], - ) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=DebertaV3Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. 
- self.assertIsInstance(restored_model, DebertaV3Backbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=DebertaV3Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=DebertaV3Backbone, + preset="deberta_v3_extra_small_en", + input_data={ + "token_ids": ops.array([[0, 581, 63773, 2]], dtype="int32"), + "segment_ids": ops.zeros((1, 4), dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape=(1, 4, 384), + # The forward pass from a preset should be stable! + expected_partial_output=ops.array( + [0.418, -0.116, -0.122, -1.847, -0.035] + ), + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class DebertaV3BackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = DebertaV3Backbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - bucket_size=2, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DebertaV3Backbone.presets: + self.run_preset_test( + cls=DebertaV3Backbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py index d477ab83a4..b03122064d 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py @@ -170,7 +170,7 @@ def __init__( x = keras.layers.Dropout(dropout, name="pooled_dropout")(x) x = keras.layers.Dense( hidden_dim, - activation=lambda x: keras.activations.gelu(x, approximate=False), + activation=keras.activations.gelu, name="pooled_dense", )(x) x = keras.layers.Dropout(backbone.dropout, name="classifier_dropout")(x) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py index c53b0fd143..7d4f61e045 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py @@ -12,16 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
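The DeBERTa hunks mirror the BERT activation cleanup but in the opposite direction: DeBERTa uses the exact (non-approximate) GeLU, and since `approximate=False` is the Keras default, the bare `keras.activations.gelu` function is a drop-in, serializable replacement for the old lambda. A quick sanity check, assuming a backend whose tensors convert cleanly to NumPy:

```python
import numpy as np

from keras_nlp.backend import keras
from keras_nlp.backend import ops

x = np.linspace(-2.0, 2.0, 5, dtype="float32")
# `approximate=False` is the default, so both calls compute the exact GeLU.
exact = ops.convert_to_numpy(keras.activations.gelu(x))
old_lambda = ops.convert_to_numpy(keras.activations.gelu(x, approximate=False))
np.testing.assert_allclose(exact, old_lambda)
```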
-import io import os -import numpy as np import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone from keras_nlp.models.deberta_v3.deberta_v3_classifier import ( DebertaV3Classifier, @@ -35,27 +29,14 @@ class DebertaV3ClassifierTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", - ) + # Setup model. self.preprocessor = DebertaV3Preprocessor( - DebertaV3Tokenizer(proto=bytes_io.getvalue()), + DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) + ), sequence_length=5, ) self.backbone = DebertaV3Backbone( @@ -64,86 +45,42 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - bucket_size=2, + max_sequence_length=self.preprocessor.sequence_length, ) - self.classifier = DebertaV3Classifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, - hidden_dim=4, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. + [1, 0], # Labels. ) + self.input_data = self.preprocessor(*self.train_data)[0] - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + def test_classifier_basics(self): + self.run_task_test( + cls=DebertaV3Classifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - self.classifier.fit(self.preprocessed_dataset) - - def test_serialization(self): - # Defaults. 
- original = DebertaV3Classifier( - self.backbone, - num_classes=2, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. - original = DebertaV3Classifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - hidden_dim=4, - name="test", - trainable=False, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) @pytest.mark.large - def test_saving_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, DebertaV3Classifier) + def test_saved_model(self): + self.run_model_saving_test( + cls=DebertaV3Classifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DebertaV3Classifier.presets: + self.run_preset_test( + cls=DebertaV3Classifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py index 91fdfbda5a..bf6a850a54 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py @@ -114,9 +114,7 @@ def __init__( outputs = MaskedLMHead( vocabulary_size=backbone.vocabulary_size, token_embedding=backbone.token_embedding, - intermediate_activation=lambda x: keras.activations.gelu( - x, approximate=False - ), + intermediate_activation=keras.activations.gelu, kernel_initializer=deberta_kernel_initializer(), name="mlm_head", )(backbone_outputs, inputs["mask_positions"]) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py index 1644c13823..519b0b4fca 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py @@ -133,17 +133,27 @@ def __init__( **kwargs, ) + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer masker creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
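+ # (When a saved model is being restored, the tokenizer can come back with
+ # `proto=None` until assets load, so `mask_token_id` and
+ # `vocabulary_size()` below are not yet valid in `__init__`.)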
self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.cls_token_id, - tokenizer.sep_token_id, - tokenizer.pad_token_id, + self.tokenizer.cls_token_id, + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, ], ) @@ -151,10 +161,10 @@ def get_config(self): config = super().get_config() config.update( { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, } ) return config diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py index 3c1f671297..217980ea59 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import ( DebertaV3MaskedLMPreprocessor, ) @@ -25,121 +23,67 @@ from keras_nlp.tests.test_case import TestCase -class DebertaV3PreprocessorTest(TestCase): +class DebertaV3MaskedLMPreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - self.tokenizer = DebertaV3Tokenizer(proto=self.proto) - self.preprocessor = DebertaV3MaskedLMPreprocessor( - tokenizer=self.tokenizer, + self.init_kwargs = { + "tokenizer": self.tokenizer, # Simplify our testing by masking every available token. 
- mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=4, - sequence_length=12, - ) - - def test_preprocess_strings(self): - input_data = "the quick brown fox" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4]) - self.assertAllEqual(y, [5, 10, 6, 8]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) - - def test_preprocess_list_of_strings(self): - input_data = ["the quick brown fox"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [[1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[5, 10, 6, 8]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_preprocess_dataset(self): - sentences = tf.constant(["the quick brown fox"] * 4) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x["token_ids"], [[1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[5, 10, 6, 8]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, + } + self.input_data = ["the quick brown fox"] - def test_mask_multiple_sentences(self): - sentence_one = tf.constant("the quick") - sentence_two = tf.constant("brown fox") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - x["token_ids"], [1, 4, 4, 2, 4, 4, 2, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=DebertaV3MaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[5, 10, 6, 8]], + [[1.0, 1.0, 1.0, 1.0]], + ), ) - self.assertAllEqual(x["mask_positions"], [1, 2, 4, 5]) - self.assertAllEqual(y, [5, 10, 6, 8]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) def test_no_masking_zero_rate(self): no_mask_preprocessor = DebertaV3MaskedLMPreprocessor( - self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, mask_selection_length=4, sequence_length=12, ) - input_data = "the quick brown fox" - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0] + input_data = ["the quick brown fox"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 0, 0, 0]], + [[0.0, 0.0, 0.0, 0.0]], + ), ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0]) - 
self.assertAllEqual(y, [0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DebertaV3MaskedLMPreprocessor.presets: + self.run_preset_test( + cls=DebertaV3MaskedLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py index 3dead2c80d..b103f390f6 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import ( @@ -31,28 +27,14 @@ class DebertaV3MaskedLMTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round", "an eagle flew"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=15, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", - ) - proto = bytes_io.getvalue() + # Setup model. self.preprocessor = DebertaV3MaskedLMPreprocessor( - DebertaV3Tokenizer(proto=proto), + DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) + ), # Simplify our testing by masking every available token. 
mask_selection_rate=1.0, mask_token_rate=1.0, @@ -66,62 +48,38 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.masked_lm = DebertaV3MaskedLM( - self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.raw_batch = [ - "the quick brown fox.", - "the eagle flew over fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.masked_lm(self.preprocessed_batch[0]) - - def test_classifier_predict(self): - self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch[0]) - - def test_classifier_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. ) - self.masked_lm.fit(self.preprocessed_dataset) + self.input_data = self.preprocessor(*self.train_data)[0] - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.masked_lm) - new_classifier = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_classifier.get_config(), - self.masked_lm.get_config(), + def test_masked_lm_basics(self): + self.run_task_test( + cls=DebertaV3MaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 12), ) @pytest.mark.large def test_saved_model(self): - model_output = self.masked_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, DebertaV3MaskedLM) + self.run_model_saving_test( + cls=DebertaV3MaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output, atol=0.01, rtol=0.01) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DebertaV3MaskedLM.presets: + self.run_preset_test( + cls=DebertaV3MaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py index dee91dcffa..93f4fbbd22 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py @@ -156,20 +156,28 @@ def __init__( ): super().__init__(**kwargs) self.tokenizer = tokenizer + self.truncate = truncate + self.sequence_length = sequence_length + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
self.packer = MultiSegmentPacker( start_value=self.tokenizer.cls_token_id, end_value=self.tokenizer.sep_token_id, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) + self.built = True def get_config(self): config = super().get_config() config.update( { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, + "sequence_length": self.sequence_length, + "truncate": self.truncate, } ) return config diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py index 1e95e7988c..a50022f3c7 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( DebertaV3Preprocessor, ) @@ -27,116 +25,48 @@ class DebertaV3PreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["the quick brown fox"], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.preprocessor = DebertaV3Preprocessor( - tokenizer=DebertaV3Tokenizer(proto=self.proto), - sequence_length=12, - ) - - def test_tokenize_strings(self): - input_data = "the quick brown fox" - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - - def test_tokenize_list_of_strings(self): - # We should handle a list of strings as as batch. 
- input_data = ["the quick brown fox"] * 4 - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [[1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual( - x_out["token_ids"], [[1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x_out["token_ids"], [[1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant("the quick brown fox") - sentence_two = tf.constant("the earth") - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], [1, 5, 10, 6, 8, 2, 5, 7, 2, 0, 0, 0] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0] - ) - - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant(["the quick brown fox"] * 4) - sentence_two = tf.constant(["the earth"] * 4) - # The first tuple or list is always interpreted as an enumeration of - # separate sequences to concatenate. - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], [[1, 5, 10, 6, 8, 2, 5, 7, 2, 0, 0, 0]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=DebertaV3Preprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[1, 5, 10, 6, 8, 2, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. 
+ ), ) def test_errors_for_2d_list_input(self): + preprocessor = DebertaV3Preprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) + preprocessor(ambiguous_input) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DebertaV3Preprocessor.presets: + self.run_preset_test( + cls=DebertaV3Preprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/deberta_v3/deberta_v3_presets.py b/keras_nlp/models/deberta_v3/deberta_v3_presets.py index f5df6cb599..febfdffd91 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_presets.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_presets.py @@ -25,21 +25,7 @@ "path": "deberta_v3", "model_card": "https://huggingface.co/microsoft/deberta-v3-xsmall", }, - "config": { - "vocabulary_size": 128100, - "num_layers": 12, - "num_heads": 6, - "hidden_dim": 384, - "intermediate_dim": 1536, - "dropout": 0.1, - "max_sequence_length": 512, - "bucket_size": 256, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_extra_small_en/v1/model.h5", - "weights_hash": "d8e10327107e5c5e20b45548a5028619", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_extra_small_en/v1/vocab.spm", - "spm_proto_hash": "1613fcbf3b82999c187b09c9db79b568", + "kaggle_handle": "kaggle://keras/deberta_v3/keras/deberta_v3_extra_small_en/2", }, "deberta_v3_small_en": { "metadata": { @@ -52,21 +38,7 @@ "path": "deberta_v3", "model_card": "https://huggingface.co/microsoft/deberta-v3-small", }, - "config": { - "vocabulary_size": 128100, - "num_layers": 6, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "bucket_size": 256, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_small_en/v1/model.h5", - "weights_hash": "84118eb7c5a735f2061ecccaf71bb888", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_small_en/v1/vocab.spm", - "spm_proto_hash": "1613fcbf3b82999c187b09c9db79b568", + "kaggle_handle": "kaggle://keras/deberta_v3/keras/deberta_v3_small_en/2", }, "deberta_v3_base_en": { "metadata": { @@ -79,21 +51,7 @@ "path": "deberta_v3", "model_card": "https://huggingface.co/microsoft/deberta-v3-base", }, - "config": { - "vocabulary_size": 128100, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "bucket_size": 256, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_base_en/v1/model.h5", - "weights_hash": "cebce044aeed36aec9b94e3b8a255430", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_base_en/v1/vocab.spm", - "spm_proto_hash": "1613fcbf3b82999c187b09c9db79b568", + "kaggle_handle": "kaggle://keras/deberta_v3/keras/deberta_v3_base_en/2", }, "deberta_v3_large_en": { "metadata": { @@ -106,21 +64,7 @@ "path": "deberta_v3", "model_card": "https://huggingface.co/microsoft/deberta-v3-large", }, - "config": { - "vocabulary_size": 128100, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - 
"intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 512, - "bucket_size": 256, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_large_en/v1/model.h5", - "weights_hash": "bce7690f358a9e39304f8c0ebc71a745", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_large_en/v1/vocab.spm", - "spm_proto_hash": "1613fcbf3b82999c187b09c9db79b568", + "kaggle_handle": "kaggle://keras/deberta_v3/keras/deberta_v3_large_en/2", }, "deberta_v3_base_multi": { "metadata": { @@ -133,20 +77,6 @@ "path": "deberta_v3", "model_card": "https://huggingface.co/microsoft/mdeberta-v3-base", }, - "config": { - "vocabulary_size": 251000, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "bucket_size": 256, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_base_multi/v1/model.h5", - "weights_hash": "26e5a824b26afd2ee336835bd337bbeb", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/deberta_v3_base_multi/v1/vocab.spm", - "spm_proto_hash": "b4ca07289eac48600b29529119d565e2", + "kaggle_handle": "kaggle://keras/deberta_v3/keras/deberta_v3_base_multi/2", }, } diff --git a/keras_nlp/models/deberta_v3/deberta_v3_presets_test.py b/keras_nlp/models/deberta_v3/deberta_v3_presets_test.py deleted file mode 100644 index a033825dad..0000000000 --- a/keras_nlp/models/deberta_v3/deberta_v3_presets_test.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone -from keras_nlp.models.deberta_v3.deberta_v3_classifier import ( - DebertaV3Classifier, -) -from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( - DebertaV3Preprocessor, -) -from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class DebertaV3PresetSmokeTest(TestCase): - """ - A smoke test for DeBERTa presets we run continuously. - - This only tests the smallest weights we have available. 
Run with: - `pytest keras_nlp/models/deberta/deberta_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = DebertaV3Tokenizer.from_preset( - "deberta_v3_extra_small_en", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [279, 1538, 3258, 16123, 260] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - preprocessor = DebertaV3Preprocessor.from_preset( - "deberta_v3_extra_small_en", - sequence_length=4, - ) - outputs = preprocessor("The quick brown fox.")["token_ids"] - expected_outputs = [1, 279, 1538, 2] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_mask_token(self): - preprocessor = DebertaV3Preprocessor.from_preset( - "deberta_v3_extra_small_en", - sequence_length=4, - ) - self.assertEqual(preprocessor.tokenizer.id_to_token(128000), "[MASK]") - self.assertEqual(preprocessor.tokenizer.token_to_id("[MASK]"), 128000) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[0, 581, 63773, 2]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = DebertaV3Backbone.from_preset( - "deberta_v3_extra_small_en", load_weights=load_weights - ) - outputs = model(input_data) - if load_weights: - outputs = outputs[0, 0, :5] - expected = [0.418, -0.116, -0.122, -1.847, -0.035] - self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["The quick brown fox."] - model = DebertaV3Classifier.from_preset( - "deberta_v3_extra_small_en", - num_classes=2, - load_weights=load_weights, - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[0, 581, 63773, 2]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = DebertaV3Classifier.from_preset( - "deberta_v3_extra_small_en", - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("deberta_tokenizer", DebertaV3Tokenizer), - ("deberta_preprocessor", DebertaV3Preprocessor), - ("deberta", DebertaV3Backbone), - ("deberta_classifier", DebertaV3Classifier), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("deberta_tokenizer", DebertaV3Tokenizer, {}), - ("deberta_preprocessor", DebertaV3Preprocessor, {}), - ("deberta", DebertaV3Backbone, {}), - ("deberta_classifier", DebertaV3Classifier, {"num_classes": 2}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("deberta_v3_extra_small_en_clowntown", **kwargs) - - -@pytest.mark.extra_large -class DebertaV3PresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every DeBERTa preset and is only run manually. 
- Run with: - `pytest keras_nlp/models/deberta/deberta_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_deberta(self, load_weights): - for preset in DebertaV3Backbone.presets: - model = DebertaV3Backbone.from_preset( - preset, load_weights=load_weights - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_deberta_classifier(self, load_weights): - for preset in DebertaV3Classifier.presets: - classifier = DebertaV3Classifier.from_preset( - preset, - num_classes=4, - load_weights=load_weights, - ) - input_data = ["The quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_deberta_classifier_without_preprocessing(self, load_weights): - for preset in DebertaV3Classifier.presets: - classifier = DebertaV3Classifier.from_preset( - preset, - num_classes=4, - load_weights=load_weights, - preprocessor=None, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in DebertaV3Tokenizer.presets: - tokenizer = DebertaV3Tokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in DebertaV3Preprocessor.presets: - preprocessor = DebertaV3Preprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py index 03c9cd5821..e66c373e65 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py @@ -93,33 +93,38 @@ class DebertaV3Tokenizer(SentencePieceTokenizer): """ def __init__(self, proto, **kwargs): + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.pad_token = "[PAD]" + self.mask_token = "[MASK]" + super().__init__(proto=proto, **kwargs) - # Check for necessary special tokens. - cls_token = "[CLS]" - sep_token = "[SEP]" - pad_token = "[PAD]" - mask_token = "[MASK]" - - # We do not throw an error if `mask_token` is not present in the - # vocabulary. - for token in [cls_token, pad_token, sep_token]: - if token not in super().get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.cls_token_id = self.token_to_id(cls_token) - self.sep_token_id = self.token_to_id(sep_token) - self.pad_token_id = self.token_to_id(pad_token) - # If the mask token is not in the vocabulary, add it to the end of the - # vocabulary. - if mask_token in super().get_vocabulary(): - self.mask_token_id = super().token_to_id(mask_token) + def set_proto(self, proto): + super().set_proto(proto) + if proto is not None: + for token in [self.cls_token, self.pad_token, self.sep_token]: + if token not in super().get_vocabulary(): + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. 
Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.cls_token_id = self.token_to_id(self.cls_token) + self.sep_token_id = self.token_to_id(self.sep_token) + self.pad_token_id = self.token_to_id(self.pad_token) + # If the mask token is not in the vocabulary, add it to the end of the + # vocabulary. + if self.mask_token in super().get_vocabulary(): + self.mask_token_id = super().token_to_id(self.mask_token) + else: + self.mask_token_id = super().vocabulary_size() else: - self.mask_token_id = super().vocabulary_size() + self.cls_token_id = None + self.sep_token_id = None + self.pad_token_id = None + self.mask_token_id = None def vocabulary_size(self): sentence_piece_size = super().vocabulary_size() diff --git a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py index d9a0708b9d..3c17cfa397 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py @@ -12,89 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer from keras_nlp.tests.test_case import TestCase class DebertaV3TokenizerTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + # Generated using create_deberta_v3_test_proto.py + proto = os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", + self.tokenizer = DebertaV3Tokenizer(proto=proto) + self.init_kwargs = {"proto": proto} + self.input_data = ["the quick brown fox", "the earth is round"] + + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=DebertaV3Tokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]], ) - self.proto = bytes_io.getvalue() - - self.tokenizer = DebertaV3Tokenizer(proto=self.proto) - - def test_tokenize(self): - input_data = "the quick brown fox" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [4, 9, 5, 7]) - - def test_tokenize_batch(self): - input_data = ["the quick brown fox", "the earth is round"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[4, 9, 5, 7], [4, 6, 8, 3]]) - - def test_detokenize(self): - input_data = [[4, 9, 5, 7]] - output = self.tokenizer.detokenize(input_data) - self.assertEqual(output, ["the quick brown fox"]) - - def test_detokenize_mask_token(self): - input_data = [[4, 9, 5, 7, self.tokenizer.mask_token_id]] - output = self.tokenizer.detokenize(input_data) - self.assertEqual(output, ["the quick brown fox"]) - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 11) - - def test_get_vocabulary_mask_token(self): - self.assertEqual(self.tokenizer.get_vocabulary()[10], "[MASK]") - - def test_id_to_token_mask_token(self): - self.assertEqual(self.tokenizer.id_to_token(10), "[MASK]") - - def test_token_to_id_mask_token(self): - 
self.assertEqual(self.tokenizer.token_to_id("[MASK]"), 10) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - DebertaV3Tokenizer(proto=bytes_io.getvalue()) + DebertaV3Tokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) + + def test_mask_token_handling(self): + tokenizer = DebertaV3Tokenizer(**self.init_kwargs) + self.assertEqual(tokenizer.get_vocabulary()[4], "[MASK]") + self.assertEqual(tokenizer.id_to_token(4), "[MASK]") + self.assertEqual(tokenizer.token_to_id("[MASK]"), 4) + input_data = [[5, 10, 6, 8, self.tokenizer.mask_token_id]] + output = tokenizer.detokenize(input_data) + self.assertEqual(output, ["the quick brown fox"]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=DebertaV3Tokenizer, + preset="deberta_v3_extra_small_en", + input_data=["The quick brown fox."], + expected_output=[[279, 1538, 3258, 16123, 260]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DebertaV3Tokenizer.presets: + self.run_preset_test( + cls=DebertaV3Tokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/deberta_v3/disentangled_self_attention.py b/keras_nlp/models/deberta_v3/disentangled_self_attention.py index 421ba91d8e..1c9ae569c7 100644 --- a/keras_nlp/models/deberta_v3/disentangled_self_attention.py +++ b/keras_nlp/models/deberta_v3/disentangled_self_attention.py @@ -232,12 +232,13 @@ def _get_log_pos(abs_pos, mid): x1=rel_pos, x2=log_pos * sign, ) - bucket_pos = ops.cast(bucket_pos, dtype="int64") + bucket_pos = ops.cast(bucket_pos, dtype="int") return bucket_pos def _get_rel_pos(self, num_positions): - ids = ops.arange(num_positions, dtype="int64") + ids = ops.arange(num_positions) + ids = ops.cast(ids, dtype="int") query_ids = ops.expand_dims(ids, axis=-1) key_ids = ops.expand_dims(ids, axis=0) key_ids = ops.repeat(key_ids, repeats=num_positions, axis=0) diff --git a/keras_nlp/models/distil_bert/distil_bert_backbone_test.py b/keras_nlp/models/distil_bert/distil_bert_backbone_test.py index 897e7ffc26..8790f87e93 100644 --- a/keras_nlp/models/distil_bert/distil_bert_backbone_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_backbone_test.py @@ -12,100 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. 
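The standalone preset suites deleted above are folded into each class's own `test_smallest_preset` / `test_all_presets`, and weights now resolve through the Kaggle handles in `deberta_v3_presets.py`. The public API is unchanged; a usage sketch, with token ids taken from the old smoke test:

```python
import keras_nlp

# Resolves via the registry entry
# "kaggle://keras/deberta_v3/keras/deberta_v3_extra_small_en/2".
tokenizer = keras_nlp.models.DebertaV3Tokenizer.from_preset(
    "deberta_v3_extra_small_en"
)
print(tokenizer("The quick brown fox."))  # [279, 1538, 3258, 16123, 260]

backbone = keras_nlp.models.DebertaV3Backbone.from_preset(
    "deberta_v3_extra_small_en"
)
```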
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.distil_bert.distil_bert_backbone import DistilBertBackbone from keras_nlp.tests.test_case import TestCase -class DistilBertTest(TestCase): +class DistilBertBackboneTest(TestCase): def setUp(self): - self.backbone = DistilBertBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - name="encoder", - ) - - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_distilbert(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_variable_sequence_length_call_distilbert(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "mask_positions": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=DistilBertBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, DistilBertBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=DistilBertBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=DistilBertBackbone, + preset="distil_bert_base_en_uncased", + input_data={ + "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape=(1, 4, 768), + # The forward pass from a preset should be stable!
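+ # (The leading output values are checked against these goldens, much
+ # like the `outputs[0, 0, :5]` assertion in the deleted smoke tests.)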
+ expected_partial_output=ops.array( + [-0.2381, -0.1965, 0.1053, -0.0847, -0.145], + ), + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class DistilBertTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = DistilBertBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=64, - intermediate_dim=128, - max_sequence_length=128, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DistilBertBackbone.presets: + self.run_preset_test( + cls=DistilBertBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/distil_bert/distil_bert_classifier.py b/keras_nlp/models/distil_bert/distil_bert_classifier.py index 770bf5e02b..42de1cee83 100644 --- a/keras_nlp/models/distil_bert/distil_bert_classifier.py +++ b/keras_nlp/models/distil_bert/distil_bert_classifier.py @@ -137,6 +137,7 @@ class DistilBertClassifier(Task): num_classes=4, ) classifier.fit(x=features, y=labels, batch_size=2) + ``` """ def __init__( diff --git a/keras_nlp/models/distil_bert/distil_bert_classifier_test.py b/keras_nlp/models/distil_bert/distil_bert_classifier_test.py index 6ca36d1692..d25f176894 100644 --- a/keras_nlp/models/distil_bert/distil_bert_classifier_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_classifier_test.py @@ -12,14 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.distil_bert.distil_bert_backbone import DistilBertBackbone from keras_nlp.models.distil_bert.distil_bert_classifier import ( DistilBertClassifier, @@ -35,13 +29,12 @@ class DistilBertClassifierTest(TestCase): def setUp(self): - # Setup model - + # Setup model. self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["the", "quick", "brown", "fox", "."] self.preprocessor = DistilBertPreprocessor( DistilBertTokenizer(vocabulary=self.vocab), - sequence_length=8, + sequence_length=5, ) self.backbone = DistilBertBackbone( vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), @@ -49,85 +42,42 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, + max_sequence_length=self.preprocessor.sequence_length, ) - self.classifier = DistilBertClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, - hidden_dim=4, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. + [1, 0], # Labels. 
) + self.input_data = self.preprocessor(*self.train_data)[0] - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - loss="sparse_categorical_crossentropy", - jit_compile=False, + def test_classifier_basics(self): + self.run_task_test( + cls=DistilBertClassifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - self.classifier.fit(self.preprocessed_dataset) - - def test_serialization(self): - # Defaults. - original = DistilBertClassifier( - self.backbone, - num_classes=2, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. - original = DistilBertClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - hidden_dim=4, - name="test", - trainable=False, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) @pytest.mark.large - def test_saving_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, DistilBertClassifier) + def test_saved_model(self): + self.run_model_saving_test( + cls=DistilBertClassifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. 
- restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DistilBertClassifier.presets: + self.run_preset_test( + cls=DistilBertClassifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py index 3fcf9bced1..f1360f58b7 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py @@ -136,33 +136,30 @@ def __init__( truncate=truncate, **kwargs, ) - + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer masker creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.cls_token_id, - tokenizer.sep_token_id, - tokenizer.pad_token_id, + self.tokenizer.cls_token_id, + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, ], ) - def get_config(self): - config = super().get_config() - config.update( - { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, - } - ) - return config - def call(self, x, y=None, sample_weight=None): if y is not None or sample_weight is not None: logging.warning( @@ -183,3 +180,15 @@ def call(self, x, y=None, sample_weight=None): y = masker_outputs["mask_ids"] sample_weight = masker_outputs["mask_weights"] return pack_x_y_sample_weight(x, y, sample_weight) + + def get_config(self): + config = super().get_config() + config.update( + { + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, + } + ) + return config diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor_test.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor_test.py index 091ae77262..b01b1da8ac 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.distil_bert.distil_bert_masked_lm_preprocessor import ( DistilBertMaskedLMPreprocessor, ) @@ -29,81 +28,60 @@ def setUp(self): self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["THE", "QUICK", "BROWN", "FOX"] self.vocab += ["the", "quick", "brown", "fox"] + self.tokenizer = DistilBertTokenizer(vocabulary=self.vocab) + self.init_kwargs = { + "tokenizer": self.tokenizer, + # Simplify our testing by masking every available token. + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, + } + self.input_data = ["the quick brown fox"] - self.preprocessor = DistilBertMaskedLMPreprocessor( - tokenizer=DistilBertTokenizer( - vocabulary=self.vocab, + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=DistilBertMaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[9, 10, 11, 12]], + [[1.0, 1.0, 1.0, 1.0]], ), - # Simplify our testing by masking every available token. - mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=5, - sequence_length=8, ) - def test_preprocess_strings(self): - input_data = " THE QUICK BROWN FOX." - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [2, 4, 4, 4, 4, 4, 3, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4, 5]) - self.assertAllEqual(y, [5, 6, 7, 8, 1]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0, 1.0]) - - def test_preprocess_list_of_strings(self): - input_data = [" THE QUICK BROWN FOX."] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[2, 4, 4, 4, 4, 4, 3, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4, 5]] * 4) - self.assertAllEqual(y, [[5, 6, 7, 8, 1]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0, 1.0]] * 4) - - def test_preprocess_dataset(self): - sentences = tf.constant([" THE QUICK BROWN FOX."] * 4) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[2, 4, 4, 4, 4, 4, 3, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4, 5]] * 4) - self.assertAllEqual(y, [[5, 6, 7, 8, 1]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0, 1.0]] * 4) - - def test_mask_multiple_sentences(self): - sentence_one = tf.constant(" THE QUICK") - sentence_two = tf.constant(" BROWN FOX.") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual(x["token_ids"], [2, 4, 4, 3, 4, 4, 4, 3]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1]) - self.assertAllEqual(x["mask_positions"], [1, 2, 4, 5, 6]) - self.assertAllEqual(y, [5, 6, 7, 8, 1]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0, 1.0]) - def test_no_masking_zero_rate(self): no_mask_preprocessor = DistilBertMaskedLMPreprocessor( - 
self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, - mask_selection_length=5, - sequence_length=8, + mask_selection_length=4, + sequence_length=12, ) - input_data = " THE QUICK BROWN FOX." - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [2, 5, 6, 7, 8, 1, 3, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0, 0]) - self.assertAllEqual(y, [0, 0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0, 0.0]) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), + input_data = ["the quick brown fox"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[2, 9, 10, 11, 12, 3, 0, 0, 0, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 0, 0, 0]], + [[0.0, 0.0, 0.0, 0.0]], + ), ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DistilBertMaskedLMPreprocessor.presets: + self.run_preset_test( + cls=DistilBertMaskedLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm_test.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm_test.py index e43feb32f1..4aa8327ae7 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.distil_bert.distil_bert_backbone import DistilBertBackbone from keras_nlp.models.distil_bert.distil_bert_masked_lm import ( DistilBertMaskedLM, @@ -49,59 +45,40 @@ def setUp(self): vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, num_heads=2, - hidden_dim=4, + hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, + max_sequence_length=self.preprocessor.sequence_length, ) - self.masked_lm = DistilBertMaskedLM( - self.backbone, - preprocessor=self.preprocessor, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. 
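To see where the numbers asserted in `test_preprocessor_basics` above come from, it helps to run the preprocessor by hand. A usage sketch against the same toy vocabulary (`[PAD]`=0, `[UNK]`=1, `[CLS]`=2, `[SEP]`=3, `[MASK]`=4, then `THE`..`FOX` as 5..8 and `the`..`fox` as 9..12); the public `keras_nlp.models` import path is assumed:

```python
from keras_nlp.models import (
    DistilBertMaskedLMPreprocessor,
    DistilBertTokenizer,
)

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
vocab += ["THE", "QUICK", "BROWN", "FOX"]
vocab += ["the", "quick", "brown", "fox"]

preprocessor = DistilBertMaskedLMPreprocessor(
    tokenizer=DistilBertTokenizer(vocabulary=vocab),
    mask_selection_rate=1.0,  # mask every token, as in the test
    mask_token_rate=1.0,
    random_token_rate=0.0,
    mask_selection_length=4,
    sequence_length=12,
)
x, y, sw = preprocessor(["the quick brown fox"])
# x["token_ids"]      -> [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]]
# x["mask_positions"] -> [[1, 2, 3, 4]]
# y                   -> [[9, 10, 11, 12]]  (ids of the masked words)
# sw                  -> [[1.0, 1.0, 1.0, 1.0]]
```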
) + self.input_data = self.preprocessor(*self.train_data)[0] - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.masked_lm(self.preprocessed_batch[0]) - - def test_distil_bert_masked_lm_fit_default_compile(self): - self.masked_lm.fit(self.raw_dataset) - - def test_classifier_predict(self): - self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch[0]) - - def test_classifier_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + def test_masked_lm_basics(self): + self.run_task_test( + cls=DistilBertMaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 10), ) - self.masked_lm.fit(self.preprocessed_dataset) @pytest.mark.large def test_saved_model(self): - model_output = self.masked_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, DistilBertMaskedLM) + self.run_model_saving_test( + cls=DistilBertMaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output, atol=0.01, rtol=0.01) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DistilBertMaskedLM.presets: + self.run_preset_test( + cls=DistilBertMaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py index f2c4326234..107275f80a 100644 --- a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py +++ b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py @@ -127,24 +127,21 @@ def __init__( ): super().__init__(**kwargs) self.tokenizer = tokenizer + self.sequence_length = sequence_length + self.truncate = truncate + + def build(self, input_shape): + super().build(input_shape) + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
self.packer = MultiSegmentPacker( start_value=self.tokenizer.cls_token_id, end_value=self.tokenizer.sep_token_id, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, - } - ) - return config - def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) x = [self.tokenizer(segment) for segment in x] @@ -155,6 +152,16 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "truncate": self.truncate, + } + ) + return config + @classproperty def tokenizer_cls(cls): return DistilBertTokenizer diff --git a/keras_nlp/models/distil_bert/distil_bert_preprocessor_test.py b/keras_nlp/models/distil_bert/distil_bert_preprocessor_test.py index 77176e4c15..22d69c88dc 100644 --- a/keras_nlp/models/distil_bert/distil_bert_preprocessor_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.distil_bert.distil_bert_preprocessor import ( DistilBertPreprocessor, ) @@ -29,79 +28,43 @@ def setUp(self): self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["THE", "QUICK", "BROWN", "FOX"] self.vocab += ["the", "quick", "brown", "fox"] - self.preprocessor = DistilBertPreprocessor( - DistilBertTokenizer(vocabulary=self.vocab), - sequence_length=8, + self.tokenizer = DistilBertTokenizer(vocabulary=self.vocab) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["THE QUICK BROWN FOX."], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ) - def test_tokenize_strings(self): - input_data = "THE QUICK BROWN FOX." - output = self.preprocessor(input_data) - self.assertAllEqual(output["token_ids"], [2, 5, 6, 7, 8, 1, 3, 0]) - self.assertAllEqual(output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - - def test_tokenize_list_of_strings(self): - # We should handle a list of strings as as batch. 
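The `MultiSegmentPacker` built above is what the deleted tests that follow were exercising: a single string packs as `[CLS] tokens [SEP]`, while a tuple of strings is read as separate segments and packed as `[CLS] a [SEP] b [SEP]`. A short sketch with the same toy vocabulary (expected ids taken from the removed assertions):

```python
from keras_nlp.models import DistilBertPreprocessor, DistilBertTokenizer

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
vocab += ["THE", "QUICK", "BROWN", "FOX"]

preprocessor = DistilBertPreprocessor(
    DistilBertTokenizer(vocabulary=vocab), sequence_length=8
)

# A single string becomes [CLS] tokens [SEP] plus padding.
print(preprocessor("THE QUICK BROWN FOX.")["token_ids"])
# -> [2, 5, 6, 7, 8, 1, 3, 0]  ("." is out of vocabulary, so [UNK] == 1)

# A tuple is treated as two segments, packed [CLS] a [SEP] b [SEP].
print(preprocessor(("THE QUICK", "BROWN FOX."))["token_ids"])
# -> [2, 5, 6, 3, 7, 8, 1, 3]
```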
- input_data = ["THE QUICK BROWN FOX."] * 4 - output = self.preprocessor(input_data) - self.assertAllEqual(output["token_ids"], [[2, 5, 6, 7, 8, 1, 3, 0]] * 4) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant(["THE QUICK BROWN FOX."] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual(x_out["token_ids"], [[2, 5, 6, 7, 8, 1, 3, 0]] * 4) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["THE QUICK BROWN FOX."] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x_out["token_ids"], [[2, 5, 6, 7, 8, 1, 3, 0]] * 4) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant("THE QUICK") - sentence_two = tf.constant("BROWN FOX.") - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual(output["token_ids"], [2, 5, 6, 3, 7, 8, 1, 3]) - self.assertAllEqual(output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1]) - - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant(["THE QUICK"] * 4) - sentence_two = tf.constant(["BROWN FOX."] * 4) - # The first tuple or list is always interpreted as an enumeration of - # separate sequences to concatenate. - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual(output["token_ids"], [[2, 5, 6, 3, 7, 8, 1, 3]] * 4) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=DistilBertPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 5, 6, 7, 8, 1, 3, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. 
+ ), ) def test_errors_for_2d_list_input(self): + preprocessor = DistilBertPreprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) + preprocessor(ambiguous_input) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DistilBertPreprocessor.presets: + self.run_preset_test( + cls=DistilBertPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/distil_bert/distil_bert_presets.py b/keras_nlp/models/distil_bert/distil_bert_presets.py index 3f939fb6da..2bc3415342 100644 --- a/keras_nlp/models/distil_bert/distil_bert_presets.py +++ b/keras_nlp/models/distil_bert/distil_bert_presets.py @@ -26,22 +26,7 @@ "path": "distil_bert", "model_card": "https://huggingface.co/distilbert-base-uncased", }, - "config": { - "vocabulary_size": 30522, - "num_layers": 6, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": { - "lowercase": True, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/model.h5", - "weights_hash": "6625a649572e74086d74c46b8d0b0da3", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en_uncased/v1/vocab.txt", - "vocabulary_hash": "64800d5d8528ce344256daf115d4965e", + "kaggle_handle": "kaggle://keras/distil_bert/keras/distil_bert_base_en_uncased/2", }, "distil_bert_base_en": { "metadata": { @@ -55,22 +40,7 @@ "path": "distil_bert", "model_card": "https://huggingface.co/distilbert-base-cased", }, - "config": { - "vocabulary_size": 28996, - "num_layers": 6, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": { - "lowercase": False, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en/v1/model.h5", - "weights_hash": "fa36aa6865978efbf85a5c8264e5eb57", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/distil_bert_base_en/v1/vocab.txt", - "vocabulary_hash": "bb6ca9b42e790e5cd986bbb16444d0e0", + "kaggle_handle": "kaggle://keras/distil_bert/keras/distil_bert_base_en/2", }, "distil_bert_base_multi": { "metadata": { @@ -82,21 +52,6 @@ "path": "distil_bert", "model_card": "https://huggingface.co/distilbert-base-multilingual-cased", }, - "config": { - "vocabulary_size": 119547, - "num_layers": 6, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": { - "lowercase": False, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/distil_bert_base_multi/v1/model.h5", - "weights_hash": "c0f11095e2a6455bd3b1a6d14800a7fa", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/distil_bert_base_multi/v1/vocab.txt", - "vocabulary_hash": "d9d865138d17f1958502ed060ecfeeb6", + "kaggle_handle": "kaggle://keras/distil_bert/keras/distil_bert_base_multi/2", }, } diff --git a/keras_nlp/models/distil_bert/distil_bert_presets_test.py b/keras_nlp/models/distil_bert/distil_bert_presets_test.py deleted file mode 100644 index a974f54aee..0000000000 --- 
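The preset refactor above collapses each hand-maintained `config`/`weights_url`/`vocabulary_hash` block into a single `kaggle_handle`, and the per-model smoke-test file deleted next is superseded by the `run_preset_test` calls added throughout. End-user code is unchanged; presets still resolve through `from_preset`. A usage sketch (assumes network access to the Kaggle-hosted weights):

```python
import keras_nlp

# Tokenizer, preprocessor, backbone, and classifier all resolve the same
# preset name through the `kaggle_handle` registered above.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    "distil_bert_base_en_uncased",
    num_classes=2,
)
classifier.predict(["The quick brown fox."])
```

Note the handles carry an explicit version suffix (`/2`), so shipping updated weights is now a one-line version bump per preset.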
a/keras_nlp/models/distil_bert/distil_bert_presets_test.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.distil_bert.distil_bert_backbone import DistilBertBackbone -from keras_nlp.models.distil_bert.distil_bert_classifier import ( - DistilBertClassifier, -) -from keras_nlp.models.distil_bert.distil_bert_preprocessor import ( - DistilBertPreprocessor, -) -from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( - DistilBertTokenizer, -) -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class DistilBertPresetSmokeTest(TestCase): - """ - A smoke test for DistilBERT presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/distilbert/distilbert_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = DistilBertTokenizer.from_preset( - "distil_bert_base_en_uncased", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [1996, 4248, 2829, 4419, 1012] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - tokenizer = DistilBertPreprocessor.from_preset( - "distil_bert_base_en_uncased", - sequence_length=4, - ) - outputs = tokenizer("The quick brown fox.")["token_ids"] - expected_outputs = [101, 1996, 4248, 102] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = DistilBertBackbone.from_preset( - "distil_bert_base_en_uncased", load_weights=load_weights - ) - outputs = model(input_data)[0, 0, :5] - if load_weights: - expected_outputs = [-0.2381, -0.1965, 0.1053, -0.0847, -0.145] - self.assertAllClose(outputs, expected_outputs, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["The quick brown fox."] - model = DistilBertClassifier.from_preset( - "distil_bert_base_en_uncased", - num_classes=2, - load_weights=load_weights, - ) - model.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = DistilBertClassifier.from_preset( - "distil_bert_base_en_uncased", - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - model.predict(input_data) - - @parameterized.named_parameters( - ("distilbert_tokenizer", DistilBertTokenizer), - ("distilbert_preprocessor", DistilBertPreprocessor), - 
("distilbert", DistilBertBackbone), - ("distilbert_classifier", DistilBertClassifier), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("distilbert_tokenizer", DistilBertTokenizer, {}), - ("distilbert_preprocessor", DistilBertPreprocessor, {}), - ("distilbert", DistilBertBackbone, {}), - ("distilbert_classifier", DistilBertClassifier, {"num_classes": 2}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("distilbert_base_uncased", **kwargs) - - -@pytest.mark.extra_large -class DistilBertPresetFullTest(TestCase): - """ - Tests the full enumeration of our preset. - - This tests every DistilBERT preset and is only run manually. - Run with: - `pytest keras_nlp/models/distilbert/distilbert_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_distilbert(self, load_weights): - for preset in DistilBertBackbone.presets: - model = DistilBertBackbone.from_preset( - preset, load_weights=load_weights - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_distilbert_classifier(self, load_weights): - for preset in DistilBertClassifier.presets: - classifier = DistilBertClassifier.from_preset( - preset, - num_classes=2, - load_weights=load_weights, - ) - input_data = ["This quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_distilbert_classifier_no_preprocessing(self, load_weights): - for preset in DistilBertClassifier.presets: - classifier = DistilBertClassifier.from_preset( - preset, - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in DistilBertTokenizer.presets: - tokenizer = DistilBertTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in DistilBertPreprocessor.presets: - preprocessor = DistilBertPreprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py index 9e344a378b..4a18398a1e 100644 --- a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py +++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py @@ -76,29 +76,38 @@ def __init__( lowercase=False, **kwargs, ): + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.pad_token = "[PAD]" + self.mask_token = "[MASK]" super().__init__( vocabulary=vocabulary, lowercase=lowercase, **kwargs, ) - # Check for necessary special tokens. 
- cls_token = "[CLS]" - sep_token = "[SEP]" - pad_token = "[PAD]" - mask_token = "[MASK]" - for token in [cls_token, pad_token, sep_token, mask_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.cls_token_id = self.token_to_id(cls_token) - self.sep_token_id = self.token_to_id(sep_token) - self.pad_token_id = self.token_to_id(pad_token) - self.mask_token_id = self.token_to_id(mask_token) + def set_vocabulary(self, vocabulary): + super().set_vocabulary(vocabulary) + + if vocabulary is not None: + # Check for necessary special tokens. + for token in [self.cls_token, self.pad_token, self.sep_token]: + if token not in self.vocabulary: + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.cls_token_id = self.token_to_id(self.cls_token) + self.sep_token_id = self.token_to_id(self.sep_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.mask_token_id = self.token_to_id(self.mask_token) + else: + self.cls_token_id = None + self.sep_token_id = None + self.pad_token_id = None + self.mask_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py index db85435f1c..e4bfba41d3 100644 --- a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py +++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.backend import keras +import pytest + from keras_nlp.models.distil_bert.distil_bert_tokenizer import ( DistilBertTokenizer, ) @@ -24,40 +25,40 @@ def setUp(self): self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] self.vocab += ["THE", "QUICK", "BROWN", "FOX"] self.vocab += ["the", "quick", "brown", "fox"] - self.tokenizer = DistilBertTokenizer(vocabulary=self.vocab) - - def test_tokenize(self): - input_data = "THE QUICK BROWN FOX." - output = self.tokenizer(input_data) - self.assertAllEqual(output, [5, 6, 7, 8, 1]) + self.init_kwargs = {"vocabulary": self.vocab} + self.input_data = ["THE QUICK BROWN FOX", "THE FOX"] - def test_tokenize_batch(self): - input_data = ["THE QUICK BROWN FOX.", "THE FOX."] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[5, 6, 7, 8, 1], [5, 8, 1]]) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=DistilBertTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[5, 6, 7, 8], [5, 8]], + ) def test_lowercase(self): - input_data = "THE QUICK BROWN FOX." 
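Moving the special-token checks from `__init__` into `set_vocabulary()` matters because a tokenizer restored from a saved model is constructed *before* its vocabulary asset is attached; the token ids must tolerate a `None` vocabulary and be recomputed once one arrives. A rough, self-contained sketch of that lifecycle (illustrative and simplified, not the real class; assumes a list vocabulary):

```python
class SpecialTokenSketch:
    """Sketch of the `set_vocabulary` hook added above (not real source)."""

    cls_token, sep_token, pad_token = "[CLS]", "[SEP]", "[PAD]"

    def set_vocabulary(self, vocabulary):
        self.vocabulary = vocabulary
        if vocabulary is None:
            # Deserialization path: assets are not attached yet, so leave
            # the ids unset instead of raising.
            self.cls_token_id = self.sep_token_id = self.pad_token_id = None
            return
        for token in (self.cls_token, self.sep_token, self.pad_token):
            if token not in vocabulary:
                raise ValueError(
                    f"Cannot find token `'{token}'` in the provided "
                    "`vocabulary`."
                )
        self.cls_token_id = vocabulary.index(self.cls_token)
        self.sep_token_id = vocabulary.index(self.sep_token)
        self.pad_token_id = vocabulary.index(self.pad_token)
```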
tokenizer = DistilBertTokenizer(vocabulary=self.vocab, lowercase=True) - output = tokenizer(input_data) - self.assertAllEqual(output, [9, 10, 11, 12, 1]) - - def test_detokenize(self): - input_tokens = [[5, 6, 7, 8]] - output = self.tokenizer.detokenize(input_tokens) - self.assertAllEqual(output, ["THE QUICK BROWN FOX"]) - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 13) + output = tokenizer(self.input_data) + self.assertAllEqual(output, [[9, 10, 11, 12], [9, 12]]) def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): DistilBertTokenizer(vocabulary=["a", "b", "c"]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=DistilBertTokenizer, + preset="distil_bert_base_en_uncased", + input_data=["The quick brown fox."], + expected_output=[[1996, 4248, 2829, 4419, 1012]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in DistilBertTokenizer.presets: + self.run_preset_test( + cls=DistilBertTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/electra/__init__.py b/keras_nlp/models/electra/__init__.py new file mode 100644 index 0000000000..ba0c2545e4 --- /dev/null +++ b/keras_nlp/models/electra/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/keras_nlp/models/electra/electra_backbone.py b/keras_nlp/models/electra/electra_backbone.py new file mode 100644 index 0000000000..f5f547bb77 --- /dev/null +++ b/keras_nlp/models/electra/electra_backbone.py @@ -0,0 +1,216 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from keras_nlp.backend import keras +from keras_nlp.layers.modeling.position_embedding import PositionEmbedding +from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding +from keras_nlp.layers.modeling.transformer_encoder import TransformerEncoder +from keras_nlp.models.backbone import Backbone +from keras_nlp.utils.keras_utils import gelu_approximate + + +def electra_kernel_initializer(stddev=0.02): + return keras.initializers.TruncatedNormal(stddev=stddev) + + +@keras.saving.register_keras_serializable(package="keras_nlp") +class ElectraBackbone(Backbone): + """An ELECTRA encoder network. + + This network implements a bidirectional Transformer-based encoder as + described in ["ELECTRA: Pre-training Text Encoders as Discriminators Rather + Than Generators"](https://arxiv.org/abs/2003.10555). It includes the + embedding lookups and transformer layers, but not the masked language model + or classification task networks. + + The default constructor gives a fully customizable, randomly initialized + ELECTRA encoder with any number of layers, heads, and embedding + dimensions. + + Disclaimer: Pre-trained models are provided on an "as is" basis, without + warranties or conditions of any kind. The underlying model is provided by a + third party and subject to a separate license, available + [here](https://huggingface.co/docs/transformers/model_doc/electra#overview). + + Args: + vocab_size: int. The size of the token vocabulary. + num_layers: int. The number of transformer layers. + num_heads: int. The number of attention heads for each transformer. + The hidden size must be divisible by the number of attention heads. + hidden_dim: int. The size of the transformer encoding and pooler layers. + embedding_dim: int. The size of the token embeddings. + intermediate_dim: int. The output dimension of the first Dense layer in + a two-layer feedforward network for each transformer. + dropout: float. Dropout probability for the Transformer encoder. + max_sequence_length: int. The maximum sequence length that this encoder + can consume. This determines the shape of the position embeddings. + Defaults to 512. + num_segments: int. The number of types that the "segment_ids" input can + take. Defaults to 2. + + Examples: + ```python + input_data = { + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), + } + # Randomly initialized ELECTRA encoder + backbone = keras_nlp.models.ElectraBackbone( + vocab_size=1000, + num_layers=2, + num_heads=2, + hidden_dim=32, + embedding_dim=32, + intermediate_dim=64, + dropout=0.1, + max_sequence_length=512, + ) + # Returns a dict with "sequence_output" and "pooled_output" keys. + outputs = backbone(input_data) + ``` + """ + + def __init__( + self, + vocab_size, + num_layers, + num_heads, + hidden_dim, + embedding_dim, + intermediate_dim, + dropout=0.1, + max_sequence_length=512, + num_segments=2, + **kwargs, + ): + # Index of the classification token in the vocabulary. + cls_token_index = 0 + # Inputs + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + segment_id_input = keras.Input( + shape=(None,), dtype="int32", name="segment_ids" + ) + padding_mask = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + + # Embed tokens, positions, and segment ids. 
+ token_embedding_layer = ReversibleEmbedding( + input_dim=vocab_size, + output_dim=embedding_dim, + embeddings_initializer=electra_kernel_initializer(), + name="token_embedding", + ) + token_embedding = token_embedding_layer(token_id_input) + position_embedding = PositionEmbedding( + initializer=electra_kernel_initializer(), + sequence_length=max_sequence_length, + name="position_embedding", + )(token_embedding) + segment_embedding = keras.layers.Embedding( + input_dim=num_segments, + output_dim=embedding_dim, + embeddings_initializer=electra_kernel_initializer(), + name="segment_embedding", + )(segment_id_input) + + # Add all embeddings together. + x = keras.layers.Add()( + (token_embedding, position_embedding, segment_embedding), + ) + # Layer normalization + x = keras.layers.LayerNormalization( + name="embeddings_layer_norm", + axis=-1, + epsilon=1e-12, + dtype="float32", + )(x) + # Dropout + x = keras.layers.Dropout( + dropout, + name="embeddings_dropout", + )(x) + if hidden_dim != embedding_dim: + x = keras.layers.Dense( + hidden_dim, + kernel_initializer=electra_kernel_initializer(), + name="embeddings_projection", + )(x) + + # Apply successive transformer encoder blocks. + for i in range(num_layers): + x = TransformerEncoder( + num_heads=num_heads, + intermediate_dim=intermediate_dim, + activation=gelu_approximate, + dropout=dropout, + layer_norm_epsilon=1e-12, + kernel_initializer=electra_kernel_initializer(), + name=f"transformer_layer_{i}", + )(x, padding_mask=padding_mask) + + sequence_output = x + # Construct the two ELECTRA outputs. The pooled output is a dense layer on + # top of the [CLS] token. + pooled_output = keras.layers.Dense( + hidden_dim, + kernel_initializer=electra_kernel_initializer(), + activation="tanh", + name="pooled_dense", + )(x[:, cls_token_index, :]) + + # Instantiate using Functional API Model constructor + super().__init__( + inputs={ + "token_ids": token_id_input, + "segment_ids": segment_id_input, + "padding_mask": padding_mask, + }, + outputs={ + "sequence_output": sequence_output, + "pooled_output": pooled_output, + }, + **kwargs, + ) + + # All references to self below this line + self.vocab_size = vocab_size + self.num_layers = num_layers + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.embedding_dim = embedding_dim + self.intermediate_dim = intermediate_dim + self.dropout = dropout + self.max_sequence_length = max_sequence_length + self.num_segments = num_segments + self.cls_token_index = cls_token_index + self.token_embedding = token_embedding_layer + + def get_config(self): + config = super().get_config() + config.update( + { + "vocab_size": self.vocab_size, + "num_layers": self.num_layers, + "num_heads": self.num_heads, + "hidden_dim": self.hidden_dim, + "embedding_dim": self.embedding_dim, + "intermediate_dim": self.intermediate_dim, + "dropout": self.dropout, + "max_sequence_length": self.max_sequence_length, + "num_segments": self.num_segments, + } + ) + return config diff --git a/keras_nlp/models/electra/electra_backbone_test.py b/keras_nlp/models/electra/electra_backbone_test.py new file mode 100644 index 0000000000..09e6c53344 --- /dev/null +++ b/keras_nlp/models/electra/electra_backbone_test.py @@ -0,0 +1,56 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
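Because the functional graph above is constructed with dictionary outputs, calling the backbone returns named tensors rather than a positional tuple. A small sketch (the dims are arbitrary; the direct module import mirrors the new test file, since a public `keras_nlp.models` export is not shown in this diff):

```python
import numpy as np

from keras_nlp.models.electra.electra_backbone import ElectraBackbone

backbone = ElectraBackbone(
    vocab_size=100,
    num_layers=2,
    num_heads=2,
    hidden_dim=8,
    embedding_dim=4,  # != hidden_dim, so the projection Dense kicks in
    intermediate_dim=16,
)
outputs = backbone(
    {
        "token_ids": np.ones((1, 12), dtype="int32"),
        "segment_ids": np.zeros((1, 12), dtype="int32"),
        "padding_mask": np.ones((1, 12), dtype="int32"),
    }
)
print(outputs["sequence_output"].shape)  # (1, 12, 8)
print(outputs["pooled_output"].shape)    # (1, 8)
```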
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from keras_nlp.backend import ops +from keras_nlp.models.electra.electra_backbone import ElectraBackbone +from keras_nlp.tests.test_case import TestCase + + +class ElectraBackboneTest(TestCase): + def setUp(self): + self.init_kwargs = { + "vocab_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "embedding_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), + } + + def test_backbone_basics(self): + self.run_backbone_test( + cls=ElectraBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "sequence_output": (2, 5, 2), + "pooled_output": (2, 2), + }, + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=ElectraBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/electra/electra_tokenizer.py b/keras_nlp/models/electra/electra_tokenizer.py new file mode 100644 index 0000000000..4fb7829424 --- /dev/null +++ b/keras_nlp/models/electra/electra_tokenizer.py @@ -0,0 +1,88 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_nlp.backend import keras +from keras_nlp.tokenizers import WordPieceTokenizer + + +@keras.saving.register_keras_serializable(package="keras_nlp") +class ElectraTokenizer(WordPieceTokenizer): + """An ELECTRA tokenizer using WordPiece subword segmentation. + + This tokenizer class will tokenize raw strings into integer sequences and + is based on `keras_nlp.tokenizers.WordPieceTokenizer`. + + If input is a batch of strings (rank > 0), the layer will output a + `tf.RaggedTensor` where the last dimension of the output is ragged. + + If input is a scalar string (rank == 0), the layer will output a dense + `tf.Tensor` with static shape `[None]`. + + Args: + vocabulary: A list of strings or a string filename path. If + passing a list, each element of the list should be a single word + piece token string. If passing a filename, the file should be a + plain text file containing a single word piece token per line. + lowercase: If `True`, the input text will be lowercased before + tokenization. + + Examples: + ```python + # Custom Vocabulary. + vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + vocab += ["The", "quick", "brown", "fox", "jumped", "."] + + # Instantiate the tokenizer. + tokenizer = keras_nlp.models.ElectraTokenizer(vocabulary=vocab) + + # Unbatched input. 
+ tokenizer("The quick brown fox jumped.") + + # Batched input. + tokenizer(["The quick brown fox jumped.", "The fox slept."]) + + # Detokenization. + tokenizer.detokenize(tokenizer("The quick brown fox jumped.")) + ``` + """ + + def __init__(self, vocabulary, lowercase=False, **kwargs): + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.pad_token = "[PAD]" + self.mask_token = "[MASK]" + super().__init__(vocabulary=vocabulary, lowercase=lowercase, **kwargs) + + def set_vocabulary(self, vocabulary): + super().set_vocabulary(vocabulary) + + if vocabulary is not None: + # Check for necessary special tokens. + for token in [self.cls_token, self.pad_token, self.sep_token]: + if token not in self.vocabulary: + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.cls_token_id = self.token_to_id(self.cls_token) + self.sep_token_id = self.token_to_id(self.sep_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.mask_token_id = self.token_to_id(self.mask_token) + else: + self.cls_token_id = None + self.sep_token_id = None + self.pad_token_id = None + self.mask_token_id = None diff --git a/keras_nlp/models/electra/electra_tokenizer_test.py b/keras_nlp/models/electra/electra_tokenizer_test.py new file mode 100644 index 0000000000..2e06fb900c --- /dev/null +++ b/keras_nlp/models/electra/electra_tokenizer_test.py @@ -0,0 +1,42 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from keras_nlp.models.electra.electra_tokenizer import ElectraTokenizer +from keras_nlp.tests.test_case import TestCase + + +class ElectraTokenizerTest(TestCase): + def setUp(self): + self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + self.vocab += ["THE", "QUICK", "BROWN", "FOX"] + self.vocab += ["the", "quick", "brown", "fox"] + self.init_kwargs = {"vocabulary": self.vocab} + self.input_data = ["THE QUICK BROWN FOX", "THE FOX"] + + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=ElectraTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[5, 6, 7, 8], [5, 8]], + ) + + def test_lowercase(self): + tokenizer = ElectraTokenizer(vocabulary=self.vocab, lowercase=True) + output = tokenizer(self.input_data) + self.assertAllEqual(output, [[9, 10, 11, 12], [9, 12]]) + + def test_errors_missing_special_tokens(self): + with self.assertRaises(ValueError): + ElectraTokenizer(vocabulary=["a", "b", "c"]) diff --git a/keras_nlp/models/f_net/f_net_backbone.py b/keras_nlp/models/f_net/f_net_backbone.py index a2e6e8ce95..ac4d290b02 100644 --- a/keras_nlp/models/f_net/f_net_backbone.py +++ b/keras_nlp/models/f_net/f_net_backbone.py @@ -21,6 +21,7 @@ from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.models.backbone import Backbone from keras_nlp.models.f_net.f_net_presets import backbone_presets +from keras_nlp.utils.keras_utils import gelu_approximate from keras_nlp.utils.python_utils import classproperty @@ -156,9 +157,7 @@ def __init__( for i in range(num_layers): x = FNetEncoder( intermediate_dim=intermediate_dim, - activation=lambda x: keras.activations.gelu( - x, approximate=True - ), + activation=gelu_approximate, dropout=dropout, layer_norm_epsilon=1e-12, kernel_initializer=f_net_kernel_initializer(), diff --git a/keras_nlp/models/f_net/f_net_backbone_test.py b/keras_nlp/models/f_net/f_net_backbone_test.py index 39236fa5f1..25dfaf799a 100644 --- a/keras_nlp/models/f_net/f_net_backbone_test.py +++ b/keras_nlp/models/f_net/f_net_backbone_test.py @@ -12,98 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. 
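The FNet hunk just above swaps the inline `lambda x: keras.activations.gelu(x, approximate=True)` for the named `gelu_approximate` helper imported from `keras_nlp.utils.keras_utils`. A named function serializes cleanly and can be shared across backbones; its body is not shown in the diff, but it is presumably no more than this sketch:

```python
from keras_nlp.backend import keras


def gelu_approximate(x):
    """Approximate GELU, usable anywhere an `activation=` arg is accepted."""
    return keras.activations.gelu(x, approximate=True)
```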
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.tests.test_case import TestCase class FNetBackboneTest(TestCase): def setUp(self): - self.backbone = FNetBackbone( - vocabulary_size=10, - num_layers=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - num_segments=4, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "segment_ids": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_f_net(self): - self.backbone(self.input_batch) - - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "f_net_backbone") - - def test_variable_sequence_length_call_f_net(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "segment_ids": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=FNetBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "sequence_output": (2, 5, 2), + "pooled_output": (2, 2), + }, ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, FNetBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose( - model_output["pooled_output"], restored_output["pooled_output"] + self.run_model_saving_test( + cls=FNetBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=FNetBackbone, + preset="f_net_base_en", + input_data={ + "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"), + "segment_ids": ops.zeros((1, 4), dtype="int32"), + }, + expected_output_shape={ + "sequence_output": (1, 4, 768), + "pooled_output": (1, 768), + }, + # The forward pass from a preset should be stable! 
+ expected_partial_output={ + "sequence_output": ( + ops.array([4.15728, -0.09661, -0.24494, -0.06810, -0.55959]) + ), + "pooled_output": ( + ops.array([-0.04117, -0.03273, -0.02134, 0.99754, -0.09777]) + ), + }, + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class FNetBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = FNetBackbone( - vocabulary_size=100, - num_layers=2, - hidden_dim=16, - intermediate_dim=32, - max_sequence_length=128, - num_segments=4, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in FNetBackbone.presets: + self.run_preset_test( + cls=FNetBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "segment_ids": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/f_net/f_net_classifier_test.py b/keras_nlp/models/f_net/f_net_classifier_test.py index 66ec838470..4defce4a71 100644 --- a/keras_nlp/models/f_net/f_net_classifier_test.py +++ b/keras_nlp/models/f_net/f_net_classifier_test.py @@ -12,16 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os -import numpy as np import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.models.f_net.f_net_classifier import FNetClassifier from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor @@ -31,117 +25,57 @@ class FNetClassifierTest(TestCase): def setUp(self): - # Setup Model - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=3, - unk_id=0, - bos_id=4, - eos_id=5, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - - self.proto = bytes_io.getvalue() - + # Setup model. self.preprocessor = FNetPreprocessor( - tokenizer=FNetTokenizer(proto=self.proto), - sequence_length=8, + FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "f_net_test_vocab.spm" + ) + ), + sequence_length=5, ) self.backbone = FNetBackbone( vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, + max_sequence_length=self.preprocessor.sequence_length, ) - self.classifier = FNetClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. + [1, 0], # Labels. ) - - # Setup data. 
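The `expected_partial_output` entries above pin the first few values of the preset forward pass, so a weight-conversion regression surfaces as a numeric diff rather than only a shape mismatch. The same check can be reproduced by hand (network access assumed; input ids and expected values copied from the test):

```python
from keras_nlp.backend import ops
from keras_nlp.models.f_net.f_net_backbone import FNetBackbone

backbone = FNetBackbone.from_preset("f_net_base_en")
outputs = backbone(
    {
        "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"),
        "segment_ids": ops.zeros((1, 4), dtype="int32"),
    }
)
# The first five values of sequence_output[0, 0] should be close to
# [4.15728, -0.09661, -0.24494, -0.06810, -0.55959].
print(outputs["sequence_output"][0, 0, :5])
```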
- self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_fnet_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_classifier_basics(self): + self.run_task_test( + cls=FNetClassifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - self.classifier.fit(self.preprocessed_dataset) - - def test_serialization(self): - # Defaults. - original = FNetClassifier( - self.backbone, - num_classes=2, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. - original = FNetClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - name="test", - trainable=False, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, FNetClassifier) + self.run_model_saving_test( + cls=FNetClassifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. 
- restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in FNetClassifier.presets: + self.run_preset_test( + cls=FNetClassifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/f_net/f_net_masked_lm_preprocessor.py b/keras_nlp/models/f_net/f_net_masked_lm_preprocessor.py index 87fa1a316d..51b4a4d1e7 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm_preprocessor.py +++ b/keras_nlp/models/f_net/f_net_masked_lm_preprocessor.py @@ -136,18 +136,27 @@ def __init__( truncate=truncate, **kwargs, ) - + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer masker creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.cls_token_id, - tokenizer.sep_token_id, - tokenizer.pad_token_id, + self.tokenizer.cls_token_id, + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, ], ) @@ -155,10 +164,10 @@ def get_config(self): config = super().get_config() config.update( { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, } ) return config diff --git a/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py b/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py index d926f66566..5f72081a0d 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
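The deleted `setUp` blocks in these FNet tests each trained a tiny SentencePiece model inline, with slightly different id assignments per file; the checked-in `f_net_test_vocab.spm` asset centralizes that. The script below is a hypothetical reconstruction of `create_f_net_test_proto.py`: the trainer options are lifted from the deleted code, the id layout (pad=0, unk=1, cls=2, sep=3, mask=4) is inferred from the new expected token ids, and the real script may differ.

```python
# Hypothetical reconstruction; options come from the deleted inline
# trainer calls, ids inferred from the new expected test outputs.
import io

import sentencepiece
import tensorflow as tf

vocab_data = tf.data.Dataset.from_tensor_slices(
    ["the quick brown fox", "the earth is round"]
)
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=vocab_data.as_numpy_iterator(),
    model_writer=bytes_io,
    vocab_size=12,
    model_type="WORD",
    pad_id=0,  # [PAD] -> 0, matching the new expected outputs
    unk_id=1,
    bos_id=2,  # "[CLS]" -> 2
    eos_id=3,  # "[SEP]" -> 3
    pad_piece="<pad>",
    unk_piece="<unk>",
    bos_piece="[CLS]",
    eos_piece="[SEP]",
    user_defined_symbols="[MASK]",  # "[MASK]" -> 4
)
with open("f_net_test_vocab.spm", "wb") as f:
    f.write(bytes_io.getvalue())
```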
-import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.f_net.f_net_masked_lm_preprocessor import ( FNetMaskedLMPreprocessor, ) @@ -27,103 +25,63 @@ class FNetMaskedLMPreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm") ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", + self.init_kwargs = { + "tokenizer": self.tokenizer, + # Simplify our testing by masking every available token. + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, + } + self.input_data = ["the quick brown fox"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=FNetMaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 4, 4, 4, 4, 3, 0, 0, 0, 0, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[5, 10, 6, 8]], + [[1.0, 1.0, 1.0, 1.0]], + ), ) - self.proto = bytes_io.getvalue() - - self.preprocessor = FNetMaskedLMPreprocessor( - tokenizer=FNetTokenizer(proto=self.proto), - mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=4, - sequence_length=12, - ) - - def test_preprocess_strings(self): - input_data = "the quick brown fox" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4]) - self.assertAllEqual(y, [5, 10, 6, 8]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) - - def test_preprocess_list_of_strings(self): - input_data = ["the quick brown fox"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [[1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[5, 10, 6, 8]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_preprocess_dataset(self): - sentences = tf.constant(["the quick brown fox"] * 4) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x["token_ids"], [[1, 4, 4, 4, 4, 2, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4]] * 4) - self.assertAllEqual(y, [[5, 10, 6, 8]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0]] * 4) - - def test_mask_multiple_sentences(self): - sentence_one = tf.constant("the quick") - sentence_two = tf.constant("brown fox") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - x["token_ids"], [1, 4, 4, 2, 4, 4, 2, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 4, 5]) - self.assertAllEqual(y, [5, 10, 6, 8]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0]) def test_no_masking_zero_rate(self): no_mask_preprocessor = 
FNetMaskedLMPreprocessor( - self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, mask_selection_length=4, sequence_length=12, ) - input_data = "the quick brown fox" - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [1, 5, 10, 6, 8, 2, 0, 0, 0, 0, 0, 0] + input_data = ["the quick brown fox"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[2, 5, 10, 6, 8, 3, 0, 0, 0, 0, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 0, 0, 0]], + [[0.0, 0.0, 0.0, 0.0]], + ), ) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0]) - self.assertAllEqual(y, [0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in FNetMaskedLMPreprocessor.presets: + self.run_preset_test( + cls=FNetMaskedLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/f_net/f_net_masked_lm_test.py b/keras_nlp/models/f_net/f_net_masked_lm_test.py index d4e9e548f1..1db6b361ed 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm_test.py +++ b/keras_nlp/models/f_net/f_net_masked_lm_test.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.models.f_net.f_net_masked_lm import FNetMaskedLM from keras_nlp.models.f_net.f_net_masked_lm_preprocessor import ( @@ -31,29 +27,14 @@ class FNetMaskedLMTest(TestCase): def setUp(self): - # Setup Model. - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the slow brown fox"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=5, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() + # Setup model. self.preprocessor = FNetMaskedLMPreprocessor( - FNetTokenizer(proto=self.proto), + FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "f_net_test_vocab.spm" + ) + ), # Simplify our testing by masking every available token. 
mask_selection_rate=1.0, mask_token_rate=1.0, @@ -66,62 +47,38 @@ def setUp(self): num_layers=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.masked_lm = FNetMaskedLM( - self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.raw_batch = [ - "the quick brown fox", - "the slow brown fox", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.masked_lm(self.preprocessed_batch) - - def test_predict(self): - # self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch) - - def test_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. ) - self.masked_lm.fit(self.preprocessed_dataset) + self.input_data = self.preprocessor(*self.train_data)[0] - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.masked_lm) - new_classifier = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_classifier.get_config(), - self.masked_lm.get_config(), + def test_masked_lm_basics(self): + self.run_task_test( + cls=FNetMaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 12), ) @pytest.mark.large def test_saved_model(self): - model_output = self.masked_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, FNetMaskedLM) + self.run_model_saving_test( + cls=FNetMaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output, atol=0.01, rtol=0.01) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in FNetMaskedLM.presets: + self.run_preset_test( + cls=FNetMaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/f_net/f_net_preprocessor.py b/keras_nlp/models/f_net/f_net_preprocessor.py index 5ebd5d1645..296493c930 100644 --- a/keras_nlp/models/f_net/f_net_preprocessor.py +++ b/keras_nlp/models/f_net/f_net_preprocessor.py @@ -129,20 +129,28 @@ def __init__( ): super().__init__(**kwargs) self.tokenizer = tokenizer + self.truncate = truncate + self.sequence_length = sequence_length + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
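+        # The packer reads `cls_token_id`, `sep_token_id` and `pad_token_id`
+        # from the tokenizer, and those ids are only populated once the
+        # tokenizer proto has been set (see `FNetTokenizer.set_proto` below).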
self.packer = MultiSegmentPacker( start_value=self.tokenizer.cls_token_id, end_value=self.tokenizer.sep_token_id, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) + self.built = True def get_config(self): config = super().get_config() config.update( { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, + "sequence_length": self.sequence_length, + "truncate": self.truncate, } ) return config diff --git a/keras_nlp/models/f_net/f_net_preprocessor_test.py b/keras_nlp/models/f_net/f_net_preprocessor_test.py index 638df89123..f67737c828 100644 --- a/keras_nlp/models/f_net/f_net_preprocessor_test.py +++ b/keras_nlp/models/f_net/f_net_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer from keras_nlp.tests.test_case import TestCase @@ -25,121 +23,46 @@ class FNetPreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm") + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["the quick brown fox"], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=3, - unk_id=0, - bos_id=4, - eos_id=5, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.preprocessor = FNetPreprocessor( - tokenizer=FNetTokenizer(proto=self.proto), - sequence_length=12, - ) - - def test_tokenize_strings(self): - input_data = "the quick brown fox" - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [4, 2, 10, 6, 8, 5, 3, 3, 3, 3, 3, 3] - ) - self.assertAllEqual( - output["segment_ids"], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - ) - - def test_tokenize_list_of_strings(self): - # We should handle a list of strings as batch. 
- input_data = ["the quick brown fox"] * 4 - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], - [[4, 2, 10, 6, 8, 5, 3, 3, 3, 3, 3, 3]] * 4, - ) - self.assertAllEqual( - output["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual( - x_out["token_ids"], - [[4, 2, 10, 6, 8, 5, 3, 3, 3, 3, 3, 3]] * 4, - ) - self.assertAllEqual( - x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x_out["token_ids"], - [[4, 2, 10, 6, 8, 5, 3, 3, 3, 3, 3, 3]] * 4, - ) - self.assertAllEqual( - x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant("the quick brown fox") - sentence_two = tf.constant("the earth") - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], - [4, 2, 10, 6, 8, 5, 2, 7, 5, 3, 3, 3], - ) - self.assertAllEqual( - output["segment_ids"], [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0] - ) - - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant(["the quick brown fox"] * 4) - sentence_two = tf.constant(["the earth"] * 4) - # The first tuple or list is always interpreted as an enumeration of - # separate sequences to concatenate. - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], - [[4, 2, 10, 6, 8, 5, 2, 7, 5, 3, 3, 3]] * 4, - ) - self.assertAllEqual( - output["segment_ids"], [[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=FNetPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[2, 5, 10, 6, 8, 3, 0, 0]], + "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. 
+ ), ) def test_errors_for_2d_list_input(self): + preprocessor = FNetPreprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) + preprocessor(ambiguous_input) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in FNetPreprocessor.presets: + self.run_preset_test( + cls=FNetPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/f_net/f_net_presets.py b/keras_nlp/models/f_net/f_net_presets.py index b3df5f8e2c..13e0e2482a 100644 --- a/keras_nlp/models/f_net/f_net_presets.py +++ b/keras_nlp/models/f_net/f_net_presets.py @@ -25,20 +25,7 @@ "path": "f_net", "model_card": "https://github.com/google-research/google-research/blob/master/f_net/README.md", }, - "config": { - "vocabulary_size": 32000, - "num_layers": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 4, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/f_net_base_en/v1/model.h5", - "weights_hash": "35db90842b85a985a0e54c86c00746fe", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/f_net_base_en/v1/vocab.spm", - "spm_proto_hash": "71c5f4610bef1daf116998a113a01f3d", + "kaggle_handle": "kaggle://keras/f_net/keras/f_net_base_en/2", }, "f_net_large_en": { "metadata": { @@ -51,19 +38,6 @@ "path": "f_net", "model_card": "https://github.com/google-research/google-research/blob/master/f_net/README.md", }, - "config": { - "vocabulary_size": 32000, - "num_layers": 24, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 512, - "num_segments": 4, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/f_net_large_en/v1/model.h5", - "weights_hash": "7ae4a3faa67ff054f8cecffb5619f779", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/f_net_large_en/v1/vocab.spm", - "spm_proto_hash": "71c5f4610bef1daf116998a113a01f3d", + "kaggle_handle": "kaggle://keras/f_net/keras/f_net_large_en/2", }, } diff --git a/keras_nlp/models/f_net/f_net_presets_test.py b/keras_nlp/models/f_net/f_net_presets_test.py deleted file mode 100644 index 1f8f79c447..0000000000 --- a/keras_nlp/models/f_net/f_net_presets_test.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.f_net.f_net_backbone import FNetBackbone -from keras_nlp.models.f_net.f_net_classifier import FNetClassifier -from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor -from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class FNetPresetSmokeTest(TestCase): - """ - A smoke test for FNet presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/f_net/f_net_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = FNetTokenizer.from_preset( - "f_net_base_en", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [97, 1467, 5187, 26, 2521, 16678] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - preprocessor = FNetPreprocessor.from_preset( - "f_net_base_en", - sequence_length=4, - ) - outputs = preprocessor("The quick brown fox.")["token_ids"] - expected_outputs = [4, 97, 1467, 5] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "segment_ids": ops.array([[0, 0, 0, 0]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = FNetBackbone.from_preset( - "f_net_base_en", load_weights=load_weights - ) - outputs = model(input_data)["sequence_output"] - if load_weights: - # The forward pass from a preset should be stable! - # This test should catch cases where we unintentionally change our - # network code in a way that would invalidate our preset weights. - # We should only update these numbers if we are updating a weights - # file, or have found a discrepancy with the upstream source. - outputs = outputs[0, 0, :5] - expected = [4.157282, -0.096616, -0.244943, -0.068104, -0.559592] - # Keep a high tolerance, so we are robust to different hardware. - self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["The quick brown fox."] - model = FNetClassifier.from_preset( - "f_net_base_en", - num_classes=2, - load_weights=load_weights, - ) - # We don't assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("f_net_tokenizer", FNetTokenizer), - ("f_net_preprocessor", FNetPreprocessor), - ("f_net", FNetBackbone), - ("f_net_classifier", FNetClassifier), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("f_net_tokenizer", FNetTokenizer, {}), - ("f_net_preprocessor", FNetPreprocessor, {}), - ("f_net", FNetBackbone, {}), - ("f_net_classifier", FNetClassifier, {"num_classes": 2}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("f_net_base_en_clowntown", **kwargs) - - -@pytest.mark.extra_large -class FNetPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every FNet preset and is only run manually. 
- Run with: - `pytest keras_nlp/models/f_net/f_net_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_f_net(self, load_weights): - for preset in FNetBackbone.presets: - model = FNetBackbone.from_preset(preset, load_weights=load_weights) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "segment_ids": ops.array([0] * 200 + [1] * 312, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_fnet_classifier(self, load_weights): - for preset in FNetClassifier.presets: - classifier = FNetClassifier.from_preset( - preset, - num_classes=2, - load_weights=load_weights, - ) - input_data = ["The quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_fnet_classifier_without_preprocessing(self, load_weights): - for preset in FNetClassifier.presets: - classifier = FNetClassifier.from_preset( - preset, - num_classes=2, - preprocessor=None, - load_weights=load_weights, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "segment_ids": ops.array([0] * 200 + [1] * 312, shape=(1, 512)), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in FNetTokenizer.presets: - tokenizer = FNetTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in FNetPreprocessor.presets: - preprocessor = FNetPreprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/f_net/f_net_tokenizer.py b/keras_nlp/models/f_net/f_net_tokenizer.py index 294e02e4db..ae3f569b1d 100644 --- a/keras_nlp/models/f_net/f_net_tokenizer.py +++ b/keras_nlp/models/f_net/f_net_tokenizer.py @@ -63,25 +63,37 @@ class FNetTokenizer(SentencePieceTokenizer): """ def __init__(self, proto, **kwargs): + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.pad_token = "" + self.mask_token = "[MASK]" super().__init__(proto=proto, **kwargs) - # Check for necessary special tokens. - cls_token = "[CLS]" - sep_token = "[SEP]" - pad_token = "" - mask_token = "[MASK]" - for token in [cls_token, sep_token, pad_token, mask_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.cls_token_id = self.token_to_id(cls_token) - self.sep_token_id = self.token_to_id(sep_token) - self.pad_token_id = self.token_to_id(pad_token) - self.mask_token_id = self.token_to_id(mask_token) + def set_proto(self, proto): + super().set_proto(proto) + if proto is not None: + for token in [ + self.cls_token, + self.sep_token, + self.pad_token, + self.mask_token, + ]: + if token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." 
+ ) + + self.cls_token_id = self.token_to_id(self.cls_token) + self.sep_token_id = self.token_to_id(self.sep_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.mask_token_id = self.token_to_id(self.mask_token) + else: + self.cls_token_id = None + self.sep_token_id = None + self.pad_token_id = None + self.mask_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/f_net/f_net_tokenizer_test.py b/keras_nlp/models/f_net/f_net_tokenizer_test.py index 259bba8ef4..3dde34e849 100644 --- a/keras_nlp/models/f_net/f_net_tokenizer_test.py +++ b/keras_nlp/models/f_net/f_net_tokenizer_test.py @@ -12,79 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer from keras_nlp.tests.test_case import TestCase -@pytest.mark.tf_only class FNetTokenizerTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=3, - unk_id=0, - bos_id=4, - eos_id=5, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.tokenizer = FNetTokenizer(proto=self.proto) - - def test_tokenize(self): - input_data = "the quick brown fox" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [2, 10, 6, 8]) - - def test_tokenize_batch(self): - input_data = ["the quick brown fox", "the earth is round"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[2, 10, 6, 8], [2, 7, 9, 11]]) + self.init_kwargs = { + # Generated using create_f_net_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "f_net_test_vocab.spm" + ) + } + self.input_data = ["the quick brown fox", "the earth is round"] - def test_detokenize(self): - input_data = [[2, 10, 6, 8]] - output = self.tokenizer.detokenize(input_data) - self.assertEqual(output, ["the quick brown fox"]) - - def test_vocabulary_size(self): - tokenizer = FNetTokenizer(proto=self.proto) - self.assertEqual(tokenizer.vocabulary_size(), 12) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=FNetTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]], + ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - FNetTokenizer(proto=bytes_io.getvalue()) + FNetTokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=FNetTokenizer, + preset="f_net_base_en", + input_data=["The quick brown fox."], + 
expected_output=[[97, 1467, 5187, 26, 2521, 16678]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in FNetTokenizer.presets: + self.run_preset_test( + cls=FNetTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt2/gpt2_backbone.py b/keras_nlp/models/gpt2/gpt2_backbone.py index cb734e3db7..89c23f71de 100644 --- a/keras_nlp/models/gpt2/gpt2_backbone.py +++ b/keras_nlp/models/gpt2/gpt2_backbone.py @@ -14,10 +14,6 @@ import copy -from tensorflow.experimental import dtensor -from tensorflow.experimental.dtensor import Layout -from tensorflow.keras.dtensor.experimental import LayoutMap - from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.layers.modeling.position_embedding import PositionEmbedding @@ -25,6 +21,7 @@ from keras_nlp.layers.modeling.transformer_decoder import TransformerDecoder from keras_nlp.models.backbone import Backbone from keras_nlp.models.gpt2.gpt2_presets import backbone_presets +from keras_nlp.utils.keras_utils import gelu_approximate from keras_nlp.utils.python_utils import classproperty @@ -65,7 +62,7 @@ class GPT2Backbone(Backbone): sequence length. This determines the variable shape for positional embeddings. - Example usage: + Example: ```python input_data = { "token_ids": np.ones(shape=(1, 12), dtype="int32"), @@ -139,9 +136,7 @@ def __init__( num_heads=num_heads, dropout=dropout, layer_norm_epsilon=1e-05, - activation=lambda x: keras.activations.gelu( - x, approximate=True - ), + activation=gelu_approximate, kernel_initializer=_gpt_2_kernel_initializer(stddev=0.02), normalize_first=True, name=f"transformer_layer_{i}", @@ -191,71 +186,3 @@ def get_config(self): @classproperty def presets(cls): return copy.deepcopy(backbone_presets) - - @classmethod - def create_layout_map(cls, mesh): - """Create a DTensor layout map for a GPT2Backbone. - - Given a DTensor mesh describing a list of devices, this method returns a - DTensor layout map for creating a `keras_nlp.models.GPT2Backbone` - instance. This mapping describes how to distribute all model weights - across multiple devices. For an overview of DTensor concepts, see - [this guide](https://www.tensorflow.org/guide/dtensor_overview). - - Args: - mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement - of devices for running distributed computation. The - first dimension in the mesh is expected to be for data parallel - distribution, and the second for model parallel distribution. - - Returns: - A `tf.keras.dtensor.experimental.LayoutMap` which contains the - proper layout to weights mapping for the model parallel setting. - - Examples: - ```python - keras.backend.experimental.enable_tf_random_generator() - keras.utils.set_random_seed(1337) - - # Update both dimensions below for a multi-device setting. - mesh = dtensor.create_mesh([("batch", 1), ("model", 1)]) - layout_map = keras_nlp.models.GPT2Backbone.create_layout_map(mesh) - - with layout_map.scope(): - model = keras_nlp.models.GPT2Backbone.from_preset("gpt2_base_en") - ``` - """ - # We assert the mesh is 2D, and assume the first mesh dim is for data - # parallel and the second dim is for model parallel. 
- mesh_shape = mesh.shape() - if len(mesh_shape) != 2: - raise ValueError( - f"Expect to create layout based on 2D mesh, received {mesh}" - ) - _, model_dim = mesh.dim_names - unshard_dim = dtensor.UNSHARDED - - layout_map = LayoutMap(mesh=mesh) - # Embedding sharding - layout_map[r".*embeddings"] = Layout([unshard_dim, model_dim], mesh) - - # Transformer block sharding - layout_map[r".*_(query|key|value)_dense.kernel"] = Layout( - [unshard_dim, unshard_dim, model_dim], mesh - ) - layout_map[r".*_(query|key|value)_dense.bias"] = Layout( - [model_dim, unshard_dim], mesh - ) - layout_map[r".*_feedforward_intermediate_dense.kernel"] = Layout( - [unshard_dim, model_dim], mesh - ) - layout_map[r".*_feedforward_intermediate_dense.bias"] = Layout( - [model_dim], mesh - ) - layout_map[r".*_feedforward_output_dense.kernel"] = Layout( - [model_dim, unshard_dim], mesh - ) - layout_map[r".*_feedforward_output_dense.bias"] = Layout( - [unshard_dim], mesh - ) - return layout_map diff --git a/keras_nlp/models/gpt2/gpt2_backbone_test.py b/keras_nlp/models/gpt2/gpt2_backbone_test.py index 1a1cedcfe4..d29bc68565 100644 --- a/keras_nlp/models/gpt2/gpt2_backbone_test.py +++ b/keras_nlp/models/gpt2/gpt2_backbone_test.py @@ -12,118 +12,65 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.gpt2.gpt2_backbone import GPT2Backbone from keras_nlp.tests.test_case import TestCase -class GPT2Test(TestCase): +class GPT2BackboneTest(TestCase): def setUp(self): - self.backbone = GPT2Backbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "segment_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_call(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "gpt2_backbone") - - def test_variable_sequence_length(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=GPT2Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - 
path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, GPT2Backbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) - - def test_create_layout_map(self): - mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)]) - with GPT2Backbone.create_layout_map(mesh).scope(): - GPT2Backbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - # Using DTensor enables the mlir bridge as a side effect. Eventually - # this will be default, but for now we have compile errors with the - # bridge elsewhere and must disable. See - # https://github.com/keras-team/keras-nlp/issues/1001 - tf.config.experimental.disable_mlir_bridge() + self.run_model_saving_test( + cls=GPT2Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=GPT2Backbone, + preset="gpt2_base_en", + input_data={ + "token_ids": ops.array([[1169, 2068, 7586, 21831, 13]]), + "padding_mask": ops.ones((1, 5), dtype="int32"), + }, + expected_output_shape=(1, 5, 768), + # The forward pass from a preset should be stable! + expected_partial_output=ops.array( + [-0.1116, -0.0375, -0.2624, 0.00891, -0.0061] + ), + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class GPT2BackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.model = GPT2Backbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in GPT2Backbone.presets: + self.run_preset_test( + cls=GPT2Backbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.model.compile() - self.model.predict(self.input_dataset) diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm.py b/keras_nlp/models/gpt2/gpt2_causal_lm.py index 23dcc41664..44eebd0a20 100644 --- a/keras_nlp/models/gpt2/gpt2_causal_lm.py +++ b/keras_nlp/models/gpt2/gpt2_causal_lm.py @@ -325,39 +325,3 @@ def next(prompt, cache, index): "token_ids": token_ids, "padding_mask": padding_mask, } - - @classmethod - def create_layout_map(cls, mesh): - """Create a DTensor layout map for a GPT2CausalLM. - - Given a DTensor mesh describing a list of devices, this method returns a - DTensor layout map for creating a `keras_nlp.models.GPT2CausalLM` - instance. This mapping describes how to distribute all model weights - across multiple devices. For an overview of DTensor concepts, see - [this guide](https://www.tensorflow.org/guide/dtensor_overview). - - Args: - mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement - of devices for running distributed computation. The - first dimension in the mesh is expected to be for data parallel - distribution, and the second for model parallel distribution. - - Returns: - A `keras.dtensor.experimental.LayoutMap` which contains the - proper layout to weights mapping for the model parallel setting. 
- - Examples: - ```python - keras.backend.experimental.enable_tf_random_generator() - keras.utils.set_random_seed(1337) - - # Update both dimensions below for a multi-device setting. - mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)]) - layout_map = keras_nlp.models.GPT2CausalLM.create_layout_map(mesh) - - with layout_map.scope(): - gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en") - ``` - """ - # As this task has no new variables, we just re-use the backbone method. - return cls.backbone_cls.create_layout_map(mesh) diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor.py b/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor.py index b501ad3fe0..97d0b42d97 100644 --- a/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor.py +++ b/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor.py @@ -142,6 +142,9 @@ def generate_preprocess( the sequence (as generation is expected to continue at the end of the inputted prompt). """ + if not self.built: + self.build(None) + x = convert_inputs_to_list_of_tensor_segments(x)[0] x = self.tokenizer(x) token_ids, padding_mask = self.packer( @@ -162,11 +165,12 @@ def generate_postprocess( padding and start/end tokens, and then converting the integer sequence back to a string. """ + if not self.built: + self.build(None) + token_ids, padding_mask = x["token_ids"], x["padding_mask"] - if not isinstance(token_ids, tf.Tensor): - token_ids = ops.convert_to_numpy(token_ids) - if not isinstance(padding_mask, tf.Tensor): - padding_mask = ops.convert_to_numpy(padding_mask) + token_ids = ops.convert_to_numpy(token_ids) + padding_mask = ops.convert_to_numpy(padding_mask) # Strip any special tokens during detokenization (e.g. the start and # end markers). In the future we could make this configurable. padding_mask = padding_mask & (token_ids != self.tokenizer.end_token_id) diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor_test.py b/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor_test.py index 63ff66b194..400273b792 100644 --- a/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor_test.py +++ b/keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.gpt2.gpt2_causal_lm_preprocessor import ( GPT2CausalLMPreprocessor, ) @@ -24,55 +23,42 @@ class GPT2CausalLMPreprocessorTest(TestCase): def setUp(self): - self.vocab = { - "!": 0, - "air": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "<|endoftext|>": 6, - } - + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] self.merges += ["Ġai r", "Ġa i", "pla ne"] - - self.preprocessor = GPT2CausalLMPreprocessor( - tokenizer=GPT2Tokenizer( - vocabulary=self.vocab, - merges=self.merges, + self.tokenizer = GPT2Tokenizer( + vocabulary=self.vocab, + merges=self.merges, + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ["airplane at airport"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=GPT2CausalLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [[1, 3, 4, 2, 5, 6, 0, 0]], # Pass through labels. + [[1, 1, 1, 1, 1, 1, 0, 0]], # Pass through sample_weights. ), - sequence_length=8, ) - def test_strings(self): - input_data = "airplane at airport" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 6, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - self.assertAllEqual(y, [1, 3, 4, 2, 5, 6, 0, 0]) - self.assertAllEqual(sw, [1, 1, 1, 1, 1, 1, 0, 0]) - - def test_list_of_strings(self): - input_data = ["airplane at airport"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[1, 3, 4, 2, 5, 6, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - def test_no_start_end_token(self): input_data = ["airplane at airport"] * 4 preprocessor = GPT2CausalLMPreprocessor( - tokenizer=GPT2Tokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, + **self.init_kwargs, add_start_token=False, add_end_token=False, ) @@ -82,44 +68,27 @@ def test_no_start_end_token(self): self.assertAllEqual(y, [[3, 4, 2, 5, 0, 0, 0, 0]] * 4) self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4) - def test_labeled_batch(self): - x = tf.constant(["airplane at airport"] * 4) - y = tf.constant([1] * 4) # Ignored. - sw = tf.constant([1.0] * 4) # Ignored. 
- x, y, sw = self.preprocessor(x, y, sw) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[1, 3, 4, 2, 5, 6, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - - def test_dataset(self): - x = tf.constant(["airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[1, 3, 4, 2, 5, 6, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - def test_generate_preprocess(self): input_data = "airplane at airport" - x = self.preprocessor.generate_preprocess(input_data) + preprocessor = GPT2CausalLMPreprocessor(**self.init_kwargs) + x = preprocessor.generate_preprocess(input_data) self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 0, 0]) self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0]) def test_generate_postprocess(self): input_data = { - "token_ids": tf.constant([6, 1, 3, 4, 2, 5, 0, 0]), - "padding_mask": tf.cast([1, 1, 1, 1, 1, 1, 0, 0], dtype="bool"), + "token_ids": [6, 1, 3, 4, 2, 5, 0, 0], + "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0], } - x = self.preprocessor.generate_postprocess(input_data) + preprocessor = GPT2CausalLMPreprocessor(**self.init_kwargs) + x = preprocessor.generate_postprocess(input_data) self.assertAllEqual(x, "airplane at airport") - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in GPT2CausalLMPreprocessor.presets: + self.run_preset_test( + cls=GPT2CausalLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm_test.py b/keras_nlp/models/gpt2/gpt2_causal_lm_test.py index c50b6c5cf4..f34b6baa47 100644 --- a/keras_nlp/models/gpt2/gpt2_causal_lm_test.py +++ b/keras_nlp/models/gpt2/gpt2_causal_lm_test.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from unittest.mock import patch import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.gpt2.gpt2_backbone import GPT2Backbone from keras_nlp.models.gpt2.gpt2_causal_lm import GPT2CausalLM @@ -31,15 +28,9 @@ class GPT2CausalLMTest(TestCase): def setUp(self): - self.vocab = { - "!": 0, - "air": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "<|endoftext|>": 6, - } + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] self.merges += ["Ġai r", "Ġa i", "pla ne"] @@ -53,68 +44,46 @@ def setUp(self): num_heads=2, hidden_dim=4, intermediate_dim=8, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.causal_lm = GPT2CausalLM( - backbone=self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.raw_batch = [ - " airplane at airport", - " airplane at airport", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_causal_lm(self): - self.causal_lm(self.preprocessed_batch) - - def test_predict(self): - self.causal_lm.predict(self.raw_batch) - self.causal_lm.preprocessor = None - self.causal_lm.predict(self.preprocessed_batch) - - def test_fit(self): - self.causal_lm.fit(self.raw_dataset) - self.causal_lm.preprocessor = None - self.causal_lm.fit(self.preprocessed_dataset) - - def test_fit_no_xla(self): - self.causal_lm.preprocessor = None - self.causal_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ([" airplane at airport", " airplane at airport"],) + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_causal_lm_basics(self): + self.run_task_test( + cls=GPT2CausalLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 8, 7), ) - self.causal_lm.fit(self.preprocessed_dataset) def test_generate(self): + causal_lm = GPT2CausalLM(**self.init_kwargs) # String input. prompt = " airplane at airport" - output = self.causal_lm.generate(" airplane at airport") + output = causal_lm.generate(" airplane at airport") self.assertTrue(prompt in output) - # String tensor input. - self.assertIsInstance(self.causal_lm.generate(self.raw_batch)[0], str) - # String dataset input. - self.assertIsInstance(self.causal_lm.generate(self.raw_dataset)[0], str) # Int tensor input. - self.causal_lm.preprocessor = None - outputs = self.causal_lm.generate(self.preprocessed_batch) + prompt_ids = self.preprocessor.generate_preprocess([prompt]) + causal_lm.preprocessor = None + outputs = causal_lm.generate(prompt_ids) # Assert prompt is in output in token id space. 
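+        # Generation only appends new tokens, so the first tokens of the
+        # output must exactly match the preprocessed prompt.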
self.assertAllEqual( outputs["token_ids"][:, :5], - self.preprocessed_batch["token_ids"][:, :5], + prompt_ids["token_ids"][:, :5], ) self.assertAllEqual( outputs["padding_mask"][:, :5], - self.preprocessed_batch["padding_mask"][:, :5], + prompt_ids["padding_mask"][:, :5], ) def test_early_stopping(self): - call_with_cache = self.causal_lm.call_with_cache + causal_lm = GPT2CausalLM(**self.init_kwargs) + call_with_cache = causal_lm.call_with_cache def wrapper(*args, **kwargs): """Modify output logits to always favor end_token_id""" @@ -125,53 +94,37 @@ def wrapper(*args, **kwargs): logits = ops.slice_update(logits, (0, 0, index), update) return logits, hidden_states, cache - with patch.object(self.causal_lm, "call_with_cache", wraps=wrapper): + with patch.object(causal_lm, "call_with_cache", wraps=wrapper): prompt = [" airplane at airport", " airplane"] - output = self.causal_lm.generate(prompt) + output = causal_lm.generate(prompt) # We should immediately abort and output the prompt. self.assertEqual(prompt, output) def test_generate_compilation(self): + causal_lm = GPT2CausalLM(**self.init_kwargs) # Assert we do not recompile with successive calls. - self.causal_lm.generate(self.raw_batch) - first_fn = self.causal_lm.generate_function - self.causal_lm.generate(self.raw_batch) - second_fn = self.causal_lm.generate_function + causal_lm.generate(" airplane at airport") + first_fn = causal_lm.generate_function + causal_lm.generate(" airplane at airport") + second_fn = causal_lm.generate_function self.assertEqual(first_fn, second_fn) # Assert we do recompile after compile is called. - self.causal_lm.compile(sampler="greedy") - self.assertIsNone(self.causal_lm.generate_function) - - def test_serialization(self): - new_causal_lm = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.causal_lm) - ) - self.assertEqual( - new_causal_lm.get_config(), self.causal_lm.get_config() - ) + causal_lm.compile(sampler="greedy") + self.assertIsNone(causal_lm.generate_function) @pytest.mark.large def test_saved_model(self): - keras.utils.set_random_seed(42) - model_output = self.causal_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.causal_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, GPT2CausalLM) - - # Check that output matches. - keras.utils.set_random_seed(42) - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=GPT2CausalLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - def test_create_layout_map(self): - mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)]) - with GPT2CausalLM.create_layout_map(mesh).scope(): - GPT2CausalLM(backbone=self.backbone) - # Using DTensor enables the mlir bridge as a side effect. Eventually - # this will be default, but for now we have compile errors with the - # bridge elsewhere and must disable. 
See - # https://github.com/keras-team/keras-nlp/issues/1001 - tf.config.experimental.disable_mlir_bridge() + @pytest.mark.extra_large + def test_all_presets(self): + for preset in GPT2CausalLM.presets: + self.run_preset_test( + cls=GPT2CausalLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt2/gpt2_preprocessor.py b/keras_nlp/models/gpt2/gpt2_preprocessor.py index bb37364364..29182f77b6 100644 --- a/keras_nlp/models/gpt2/gpt2_preprocessor.py +++ b/keras_nlp/models/gpt2/gpt2_preprocessor.py @@ -123,24 +123,18 @@ def __init__( self.sequence_length = sequence_length self.add_start_token = add_start_token self.add_end_token = add_end_token + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.packer = StartEndPacker( - start_value=tokenizer.start_token_id, - end_value=tokenizer.end_token_id, - pad_value=tokenizer.pad_token_id, - sequence_length=sequence_length, + start_value=self.tokenizer.start_token_id, + end_value=self.tokenizer.end_token_id, + pad_value=self.tokenizer.pad_token_id, + sequence_length=self.sequence_length, return_padding_mask=True, ) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config + self.built = True def call( self, @@ -170,6 +164,17 @@ def call( } return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "add_start_token": self.add_start_token, + "add_end_token": self.add_end_token, + } + ) + return config + @classproperty def presets(cls): return copy.deepcopy(backbone_presets) diff --git a/keras_nlp/models/gpt2/gpt2_preprocessor_test.py b/keras_nlp/models/gpt2/gpt2_preprocessor_test.py index db221c7279..d7dcd261ed 100644 --- a/keras_nlp/models/gpt2/gpt2_preprocessor_test.py +++ b/keras_nlp/models/gpt2/gpt2_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.gpt2.gpt2_preprocessor import GPT2Preprocessor from keras_nlp.models.gpt2.gpt2_tokenizer import GPT2Tokenizer from keras_nlp.tests.test_case import TestCase @@ -22,41 +21,32 @@ class GPT2PreprocessorTest(TestCase): def setUp(self): - self.vocab = { - "!": 0, - "air": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "<|endoftext|>": 6, - } - + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] self.merges += ["Ġai r", "Ġa i", "pla ne"] - - self.preprocessor = GPT2Preprocessor( - tokenizer=GPT2Tokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, + self.tokenizer = GPT2Tokenizer( + vocabulary=self.vocab, + merges=self.merges, + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ["airplane at airport"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=GPT2Preprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output={ + "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, ) - - def test_tokenize_strings(self): - input_data = "airplane at airport" - - x = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 6, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - - def test_tokenize_list_of_strings(self): - input_data = ["airplane at airport"] * 4 - - x = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) def test_no_start_end_token(self): input_data = ["airplane at airport"] * 4 @@ -74,33 +64,17 @@ def test_no_start_end_token(self): self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4) self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - def test_tokenize_labeled_batch(self): - x = tf.constant(["airplane at airport"] * 4) - y_in = tf.constant([1] * 4) - sw_in = tf.constant([1.0] * 4) - x, y, sw = self.preprocessor(x, y_in, sw_in) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, y_in) - self.assertAllEqual(sw, sw_in) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - def test_sequence_length_override(self): input_data = "airplane at airport" - x = self.preprocessor(input_data, sequence_length=4) + preprocessor = GPT2Preprocessor(**self.init_kwargs) + x = preprocessor(input_data, sequence_length=4) self.assertAllEqual(x["token_ids"], [6, 1, 3, 6]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def 
test_all_presets(self): + for preset in GPT2Preprocessor.presets: + self.run_preset_test( + cls=GPT2Preprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt2/gpt2_presets.py b/keras_nlp/models/gpt2/gpt2_presets.py index 7101bdb104..c51f170aa2 100644 --- a/keras_nlp/models/gpt2/gpt2_presets.py +++ b/keras_nlp/models/gpt2/gpt2_presets.py @@ -26,22 +26,7 @@ "path": "gpt2", "model_card": "https://github.com/openai/gpt-2/blob/master/model_card.md", }, - "config": { - "vocabulary_size": 50257, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 1024, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/model.h5", - "weights_hash": "f4ea6e1b214516dd7de452461ee6e16e", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/vocab.json", - "vocabulary_hash": "dffec25a898b1f5e569bec4dffd7e5c0", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/gpt2/keras/gpt2_base_en/2", }, "gpt2_medium_en": { "metadata": { @@ -54,22 +39,7 @@ "path": "gpt2", "model_card": "https://github.com/openai/gpt-2/blob/master/model_card.md", }, - "config": { - "vocabulary_size": 50257, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 1024, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_medium_en/v1/model.h5", - "weights_hash": "580ff9b79c04fc90e6d6f47e975c5afe", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_medium_en/v1/vocab.json", - "vocabulary_hash": "dffec25a898b1f5e569bec4dffd7e5c0", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_medium_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/gpt2/keras/gpt2_medium_en/2", }, "gpt2_large_en": { "metadata": { @@ -82,22 +52,7 @@ "path": "gpt2", "model_card": "https://github.com/openai/gpt-2/blob/master/model_card.md", }, - "config": { - "vocabulary_size": 50257, - "num_layers": 36, - "num_heads": 20, - "hidden_dim": 1280, - "intermediate_dim": 5120, - "dropout": 0.1, - "max_sequence_length": 1024, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_large_en/v1/model.h5", - "weights_hash": "67957cb3dfc9e965960dabe068811e1a", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_large_en/v1/vocab.json", - "vocabulary_hash": "dffec25a898b1f5e569bec4dffd7e5c0", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_large_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/gpt2/keras/gpt2_large_en/2", }, "gpt2_extra_large_en": { "metadata": { @@ -110,22 +65,7 @@ "path": "gpt2", "model_card": "https://github.com/openai/gpt-2/blob/master/model_card.md", }, - "config": { - "vocabulary_size": 50257, - "num_layers": 48, - "num_heads": 25, - "hidden_dim": 1600, - "intermediate_dim": 6400, - "dropout": 0.1, - "max_sequence_length": 1024, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_extra_large_en/v1/model.h5", - "weights_hash": 
"d093c1ee0d9705d845c0190909aa2917", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_extra_large_en/v1/vocab.json", - "vocabulary_hash": "dffec25a898b1f5e569bec4dffd7e5c0", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_extra_large_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/gpt2/keras/gpt2_extra_large_en/2", }, "gpt2_base_en_cnn_dailymail": { "metadata": { @@ -137,21 +77,6 @@ "official_name": "GPT-2", "path": "gpt2", }, - "config": { - "vocabulary_size": 50257, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 1024, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_base_en_news/v1/model.h5", - "weights_hash": "09d86ca6e1b4213886b720a1392f2a70", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_base_en_news/v1/vocab.json", - "vocabulary_hash": "dffec25a898b1f5e569bec4dffd7e5c0", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/gpt2_base_en_news/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/gpt2/keras/gpt2_base_en_cnn_dailymail/2", }, } diff --git a/keras_nlp/models/gpt2/gpt2_presets_test.py b/keras_nlp/models/gpt2/gpt2_presets_test.py deleted file mode 100644 index 37e7e53e87..0000000000 --- a/keras_nlp/models/gpt2/gpt2_presets_test.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.gpt2.gpt2_backbone import GPT2Backbone -from keras_nlp.models.gpt2.gpt2_tokenizer import GPT2Tokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class GPT2PresetSmokeTest(TestCase): - """ - A smoke test for GPT-2 presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/gpt2/gpt2_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en") - outputs = tokenizer("The quick brown fox.") - expected_outputs = [464, 2068, 7586, 21831, 13] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[1169, 2068, 7586, 21831, 13]]), - "padding_mask": ops.array([[1, 1, 1, 1, 1]]), - } - model = GPT2Backbone.from_preset( - "gpt2_base_en", load_weights=load_weights - ) - outputs = model(input_data)[0, 0, :5] - if load_weights: - # The forward pass from a preset should be stable! - # This test should catch cases where we unintentionally change our - # network code in a way that would invalidate our preset weights. 
- # We should only update these numbers if we are updating a weights - # file, or have found a discrepancy with the upstream source. - expected_outputs = [-0.1116, -0.0375, -0.2624, 0.00891, -0.0061] - # Keep a high tolerance, so we are robust to different hardware. - self.assertAllClose(outputs, expected_outputs, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("gpt2_tokenizer", GPT2Tokenizer), - ("gpt2", GPT2Backbone), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("gpt2_tokenizer", GPT2Tokenizer), - ("gpt2", GPT2Backbone), - ) - def test_unknown_preset_error(self, cls): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("gpt2_base_en_clowntown") - - -@pytest.mark.extra_large -class GPT2PresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every GPT-2 preset and is only run manually. - Run with: - `pytest keras_nlp/models/gpt2/gpt2_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_gpt2(self, load_weights): - for preset in GPT2Backbone.presets: - model = GPT2Backbone.from_preset(preset, load_weights=load_weights) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 1024), - dtype="int64", - maxval=model.vocabulary_size, - ), - "padding_mask": ops.array([1] * 1024, shape=(1, 1024)), - } - model(input_data) - - def test_load_tokenizers(self): - for preset in GPT2Tokenizer.presets: - tokenizer = GPT2Tokenizer.from_preset(preset) - tokenizer("The quick brown fox.") diff --git a/keras_nlp/models/gpt2/gpt2_tokenizer.py b/keras_nlp/models/gpt2/gpt2_tokenizer.py index 9401b385d8..15b35bed87 100644 --- a/keras_nlp/models/gpt2/gpt2_tokenizer.py +++ b/keras_nlp/models/gpt2/gpt2_tokenizer.py @@ -70,32 +70,39 @@ class GPT2Tokenizer(BytePairTokenizer): def __init__( self, - vocabulary, - merges, + vocabulary=None, + merges=None, **kwargs, ): - # Special tokens. - end_token = "<|endoftext|>" + # GPT2 uses the same start as end token, i.e., "<|endoftext|>". + self.end_token = self.start_token = "<|endoftext|>" super().__init__( vocabulary=vocabulary, merges=merges, - unsplittable_tokens=[end_token], + unsplittable_tokens=[self.end_token], **kwargs, ) - # Check whether special tokens are present in the vocabulary. - if end_token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{end_token}'` in the provided " - f"`vocabulary`. Please provide `'{end_token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.end_token_id = self.token_to_id(end_token) - # GPT2 uses the same start as end token, i.e., "<|endoftext|>". - self.start_token_id = self.end_token_id - self.pad_token_id = 0 + def set_vocabulary_and_merges(self, vocabulary, merges): + super().set_vocabulary_and_merges(vocabulary, merges) + + if vocabulary is not None: + # Check for necessary special tokens. + if self.end_token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{self.end_token}'` in the provided " + f"`vocabulary`. Please provide `'{self.end_token}'` in " + "your `vocabulary` or use a pretrained `vocabulary` name." 
+ ) + + self.end_token_id = self.token_to_id(self.end_token) + self.start_token_id = self.end_token_id + self.pad_token_id = 0 + else: + self.end_token_id = None + self.start_token_id = None + self.pad_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/gpt2/gpt2_tokenizer_test.py b/keras_nlp/models/gpt2/gpt2_tokenizer_test.py index 38ed89f121..026392fd25 100644 --- a/keras_nlp/models/gpt2/gpt2_tokenizer_test.py +++ b/keras_nlp/models/gpt2/gpt2_tokenizer_test.py @@ -12,82 +12,52 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.backend import keras +import pytest + from keras_nlp.models.gpt2.gpt2_tokenizer import GPT2Tokenizer from keras_nlp.tests.test_case import TestCase class GPT2TokenizerTest(TestCase): def setUp(self): - self.vocab = { - "<|endoftext|>": 0, - "Ġair": 1, - "plane": 2, - "Ġat": 3, - "port": 4, - "Ġkoh": 5, - "li": 6, - "Ġis": 7, - "Ġthe": 8, - "Ġbest": 9, - } - self.merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - self.merges += [ - "Ġa t", - "p o", - "r t", - "o h", - "l i", - "Ġi s", - "Ġb e", - "s t", - ] - self.merges += [ - "Ġt h", - "Ġai r", - "pl a", - "Ġk oh", - "Ġth e", - "Ġbe st", - "po rt", + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + self.input_data = [ + " airplane at airport<|endoftext|>", + " airplane airport", ] - self.merges += ["pla ne"] - self.tokenizer = GPT2Tokenizer( - vocabulary=self.vocab, merges=self.merges + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=GPT2Tokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]], ) - def test_tokenize(self): - input_data = " airplane at airport" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [1, 2, 3, 1, 4]) - - def test_tokenize_end_token(self): - input_data = " airplane at airport<|endoftext|>" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [1, 2, 3, 1, 4, 0]) - - def test_tokenize_batch(self): - input_data = [" airplane at airport", " kohli is the best"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[1, 2, 3, 1, 4], [5, 6, 7, 8, 9]]) - - def test_detokenize(self): - input_tokens = [1, 2, 3, 1, 4] - output = self.tokenizer.detokenize(input_tokens) - self.assertEqual(output, " airplane at airport") - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 10) - def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): GPT2Tokenizer(vocabulary=["a", "b", "c"], merges=[]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=GPT2Tokenizer, + preset="gpt2_base_en", + input_data=["The quick brown fox."], + expected_output=[[464, 2068, 7586, 21831, 13]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in 
GPT2Tokenizer.presets: + self.run_preset_test( + cls=GPT2Tokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py index 62dba1dd2d..5f86766433 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.models.backbone import Backbone from keras_nlp.models.gpt_neo_x.gpt_neo_x_decoder import GPTNeoXDecoder +from keras_nlp.utils.keras_utils import gelu_approximate def _gpt_neo_x_kernel_initializer(stddev=0.02): return keras.initializers.RandomNormal(stddev=stddev) -@keras_nlp_export("keras_nlp.models.GPTNeoXBackbone") +@keras.saving.register_keras_serializable(package="keras_nlp") class GPTNeoXBackbone(Backbone): """GPT-NeoX core network with hyperparameters. @@ -106,9 +106,7 @@ def __init__( rotary_percentage=rotary_percentage, rotary_max_wavelength=rotary_max_wavelength, layer_norm_epsilon=layer_norm_epsilon, - activation=lambda x: keras.activations.gelu( - x, approximate=True - ), + activation=gelu_approximate, kernel_initializer=_gpt_neo_x_kernel_initializer(stddev=0.02), name=f"transformer_layer_{i}", )(x, decoder_padding_mask=padding_mask) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone_test.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone_test.py index 803006a06a..f207f4f19e 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone_test.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone_test.py @@ -12,100 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.models import GPTNeoXBackbone +from keras_nlp.backend import ops +from keras_nlp.models.gpt_neo_x.gpt_neo_x_backbone import GPTNeoXBackbone from keras_nlp.tests.test_case import TestCase -class GPTNeoXTest(TestCase): +class GPTNeoXBackboneTest(TestCase): def setUp(self): - self.backbone = GPTNeoXBackbone( - vocabulary_size=10, - num_layers=4, - num_heads=4, - hidden_dim=64, - intermediate_dim=64, - max_sequence_length=10, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_call(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 64)) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "gpt_neo_x_backbone") - - def test_variable_sequence_length(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=GPTNeoXBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, GPTNeoXBackbone) - - # Check that output matches. 
- restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) - - -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class GPTNeoXBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - GPTNeoXBackbone( - vocabulary_size=10, - num_layers=4, - num_heads=4, - hidden_dim=64, - intermediate_dim=64, - max_sequence_length=10, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.model.compile() - self.model.predict(self.input_dataset) + self.run_model_saving_test( + cls=GPTNeoXBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py index 0f813470aa..a11331176f 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.generative_task import GenerativeTask @@ -23,7 +22,7 @@ from keras_nlp.utils.python_utils import classproperty -@keras_nlp_export("keras_nlp.models.GPTNeoXCausalLM") +@keras.saving.register_keras_serializable(package="keras_nlp") class GPTNeoXCausalLM(GenerativeTask): """An end-to-end GPTNeoX model for causal language modeling. diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py index 3ed16c3ff9..665622540e 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py @@ -15,7 +15,7 @@ import tensorflow as tf from absl import logging -from keras_nlp.api_export import keras_nlp_export +from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.gpt_neo_x.gpt_neo_x_preprocessor import ( GPTNeoXPreprocessor, @@ -26,7 +26,7 @@ from keras_nlp.utils.keras_utils import pack_x_y_sample_weight -@keras_nlp_export("keras_nlp.models.GPTNeoXCausalLMPreprocessor") +@keras.saving.register_keras_serializable(package="keras_nlp") class GPTNeoXCausalLMPreprocessor(GPTNeoXPreprocessor): """GPT-NeoX Causal LM preprocessor. @@ -110,6 +110,9 @@ def generate_preprocess( the sequence (as generation is expected to continue at the end of the inputted prompt). """ + if not self.built: + self.build(None) + x = convert_inputs_to_list_of_tensor_segments(x)[0] x = self.tokenizer(x) token_ids, padding_mask = self.packer( @@ -130,6 +133,9 @@ def generate_postprocess( padding and start/end tokens, and then converting the integer sequence back to a string. 
""" + if not self.built: + self.build(None) + token_ids, padding_mask = x["token_ids"], x["padding_mask"] if not isinstance(token_ids, tf.Tensor): token_ids = ops.convert_to_numpy(token_ids) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py index e494b12b63..f5a7c57421 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py @@ -14,7 +14,6 @@ import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.gpt_neo_x.gpt_neo_x_causal_lm_preprocessor import ( GPTNeoXCausalLMPreprocessor, ) @@ -24,55 +23,42 @@ class GPTNeoXCausalLMPreprocessorTest(TestCase): def setUp(self): - self.vocab = { - "!": 0, - "air": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "<|endoftext|>": 6, - } - + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] self.merges += ["Ġai r", "Ġa i", "pla ne"] - - self.preprocessor = GPTNeoXCausalLMPreprocessor( - tokenizer=GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, + self.tokenizer = GPTNeoXTokenizer( + vocabulary=self.vocab, + merges=self.merges, + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ["airplane at airport"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=GPTNeoXCausalLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [[1, 3, 4, 2, 5, 6, 0, 0]], # Pass through labels. + [[1, 1, 1, 1, 1, 1, 0, 0]], # Pass through sample_weights. ), - sequence_length=8, ) - def test_strings(self): - input_data = "airplane at airport" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 6, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - self.assertAllEqual(y, [1, 3, 4, 2, 5, 6, 0, 0]) - self.assertAllEqual(sw, [1, 1, 1, 1, 1, 1, 0, 0]) - - def test_list_of_strings(self): - input_data = ["airplane at airport"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[1, 3, 4, 2, 5, 6, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - def test_no_start_end_token(self): input_data = ["airplane at airport"] * 4 preprocessor = GPTNeoXCausalLMPreprocessor( - tokenizer=GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, + **self.init_kwargs, add_start_token=False, add_end_token=False, ) @@ -82,29 +68,10 @@ def test_no_start_end_token(self): self.assertAllEqual(y, [[3, 4, 2, 5, 0, 0, 0, 0]] * 4) self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4) - def test_labeled_batch(self): - x = tf.constant(["airplane at airport"] * 4) - y = tf.constant([1] * 4) # Ignored. - sw = tf.constant([1.0] * 4) # Ignored. 
- x, y, sw = self.preprocessor(x, y, sw) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[1, 3, 4, 2, 5, 6, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - - def test_dataset(self): - x = tf.constant(["airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[1, 3, 4, 2, 5, 6, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - def test_generate_preprocess(self): input_data = "airplane at airport" - x = self.preprocessor.generate_preprocess(input_data) + preprocessor = GPTNeoXCausalLMPreprocessor(**self.init_kwargs) + x = preprocessor.generate_preprocess(input_data) self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 0, 0]) self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0]) @@ -113,13 +80,6 @@ def test_generate_postprocess(self): "token_ids": tf.constant([6, 1, 3, 4, 2, 5, 0, 0]), "padding_mask": tf.cast([1, 1, 1, 1, 1, 1, 0, 0], dtype="bool"), } - x = self.preprocessor.generate_postprocess(input_data) + preprocessor = GPTNeoXCausalLMPreprocessor(**self.init_kwargs) + x = preprocessor.generate_postprocess(input_data) self.assertAllEqual(x, "airplane at airport") - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py index dda9f51bdd..c8839c8be9 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from unittest.mock import patch import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.gpt_neo_x.gpt_neo_x_backbone import GPTNeoXBackbone from keras_nlp.models.gpt_neo_x.gpt_neo_x_causal_lm import GPTNeoXCausalLM @@ -31,15 +28,9 @@ class GPTNeoXCausalLMTest(TestCase): def setUp(self): - self.vocab = { - "!": 0, - "air": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "<|endoftext|>": 6, - } + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] self.merges += ["Ġai r", "Ġa i", "pla ne"] @@ -50,72 +41,49 @@ def setUp(self): self.backbone = GPTNeoXBackbone( vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, - num_heads=4, - hidden_dim=32, - intermediate_dim=32, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.causal_lm = GPTNeoXCausalLM( - backbone=self.backbone, - preprocessor=self.preprocessor, + num_heads=2, + hidden_dim=4, + intermediate_dim=8, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.raw_batch = [ - " airplane at airport", - " airplane at airport", - ] - - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_causal_lm(self): - self.causal_lm(self.preprocessed_batch) - - def test_predict(self): - self.causal_lm.predict(self.raw_batch) - self.causal_lm.preprocessor = None - self.causal_lm.predict(self.preprocessed_batch) - - def test_fit(self): - self.causal_lm.fit(self.raw_dataset) - self.causal_lm.preprocessor = None - self.causal_lm.fit(self.preprocessed_dataset) - - def test_fit_no_xla(self): - self.causal_lm.preprocessor = None - self.causal_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ([" airplane at airport", " airplane at airport"],) + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_causal_lm_basics(self): + self.run_task_test( + cls=GPTNeoXCausalLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 8, 7), ) - self.causal_lm.fit(self.preprocessed_dataset) def test_generate(self): + causal_lm = GPTNeoXCausalLM(**self.init_kwargs) # String input. prompt = " airplane at airport" - output = self.causal_lm.generate(" airplane at airport") + output = causal_lm.generate(" airplane at airport") self.assertTrue(prompt in output) - # String tensor input. - self.assertIsInstance(self.causal_lm.generate(self.raw_batch)[0], str) - # String dataset input. - self.assertIsInstance(self.causal_lm.generate(self.raw_dataset)[0], str) # Int tensor input. - self.causal_lm.preprocessor = None - outputs = self.causal_lm.generate(self.preprocessed_batch) + prompt_ids = self.preprocessor.generate_preprocess([prompt]) + causal_lm.preprocessor = None + outputs = causal_lm.generate(prompt_ids) # Assert prompt is in output in token id space. 
self.assertAllEqual( outputs["token_ids"][:, :5], - self.preprocessed_batch["token_ids"][:, :5], + prompt_ids["token_ids"][:, :5], ) self.assertAllEqual( outputs["padding_mask"][:, :5], - self.preprocessed_batch["padding_mask"][:, :5], + prompt_ids["padding_mask"][:, :5], ) def test_early_stopping(self): - call_with_cache = self.causal_lm.call_with_cache + causal_lm = GPTNeoXCausalLM(**self.init_kwargs) + call_with_cache = causal_lm.call_with_cache def wrapper(*args, **kwargs): """Modify output logits to always favor end_token_id""" @@ -126,43 +94,28 @@ def wrapper(*args, **kwargs): logits = ops.slice_update(logits, (0, 0, index), update) return logits, hidden_states, cache - with patch.object(self.causal_lm, "call_with_cache", wraps=wrapper): + with patch.object(causal_lm, "call_with_cache", wraps=wrapper): prompt = [" airplane at airport", " airplane"] - output = self.causal_lm.generate(prompt) + output = causal_lm.generate(prompt) # We should immediately abort and output the prompt. self.assertEqual(prompt, output) def test_generate_compilation(self): + causal_lm = GPTNeoXCausalLM(**self.init_kwargs) # Assert we do not recompile with successive calls. - self.causal_lm.generate(self.raw_batch) - first_fn = self.causal_lm.generate_function - self.causal_lm.generate(self.raw_batch) - second_fn = self.causal_lm.generate_function + causal_lm.generate(" airplane at airport") + first_fn = causal_lm.generate_function + causal_lm.generate(" airplane at airport") + second_fn = causal_lm.generate_function self.assertEqual(first_fn, second_fn) # Assert we do recompile after compile is called. - self.causal_lm.compile(sampler="greedy") - self.assertIsNone(self.causal_lm.generate_function) - - def test_serialization(self): - new_causal_lm = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.causal_lm) - ) - self.assertEqual( - new_causal_lm.get_config(), self.causal_lm.get_config() - ) + causal_lm.compile(sampler="greedy") + self.assertIsNone(causal_lm.generate_function) @pytest.mark.large def test_saved_model(self): - keras.utils.set_random_seed(42) - model_output = self.causal_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.causal_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, GPTNeoXCausalLM) - - # Check that output matches. - keras.utils.set_random_seed(42) - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=GPTNeoXCausalLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor.py index 0dad9a053a..8f0d5731aa 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from keras_nlp.api_export import keras_nlp_export +from keras_nlp.backend import keras from keras_nlp.layers.preprocessing.start_end_packer import StartEndPacker from keras_nlp.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer from keras_nlp.models.preprocessor import Preprocessor @@ -23,7 +23,7 @@ from keras_nlp.utils.python_utils import classproperty -@keras_nlp_export("keras_nlp.models.GPTNeoXPreprocessor") +@keras.saving.register_keras_serializable(package="keras_nlp") class GPTNeoXPreprocessor(Preprocessor): """GPTNeoX preprocessing layer which tokenizes and packs inputs. @@ -79,24 +79,19 @@ def __init__( self.sequence_length = sequence_length self.add_start_token = add_start_token self.add_end_token = add_end_token + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.packer = StartEndPacker( - start_value=tokenizer.start_token_id, - end_value=tokenizer.end_token_id, - pad_value=tokenizer.pad_token_id, - sequence_length=sequence_length, + start_value=self.tokenizer.start_token_id, + end_value=self.tokenizer.end_token_id, + pad_value=self.tokenizer.pad_token_id, + sequence_length=self.sequence_length, return_padding_mask=True, ) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config + self.built = True def call( self, @@ -126,6 +121,17 @@ def call( } return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "add_start_token": self.add_start_token, + "add_end_token": self.add_end_token, + } + ) + return config + @classproperty def tokenizer_cls(cls): return GPTNeoXTokenizer diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py index 53655822a0..c87329af4a 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import tensorflow as tf - -from keras_nlp.backend import keras from keras_nlp.models.gpt_neo_x.gpt_neo_x_preprocessor import ( GPTNeoXPreprocessor, ) @@ -24,41 +21,32 @@ class GPTNeoXPreprocessorTest(TestCase): def setUp(self): - self.vocab = { - "!": 0, - "air": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "<|endoftext|>": 6, - } - + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] self.merges += ["Ġai r", "Ġa i", "pla ne"] - - self.preprocessor = GPTNeoXPreprocessor( - tokenizer=GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, + self.tokenizer = GPTNeoXTokenizer( + vocabulary=self.vocab, + merges=self.merges, + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ["airplane at airport"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=GPTNeoXPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output={ + "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, ) - - def test_tokenize_strings(self): - input_data = "airplane at airport" - - x = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 6, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - - def test_tokenize_list_of_strings(self): - input_data = ["airplane at airport"] * 4 - - x = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) def test_no_start_end_token(self): input_data = ["airplane at airport"] * 4 @@ -76,33 +64,8 @@ def test_no_start_end_token(self): self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4) self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - def test_tokenize_labeled_batch(self): - x = tf.constant(["airplane at airport"] * 4) - y_in = tf.constant([1] * 4) - sw_in = tf.constant([1.0] * 4) - x, y, sw = self.preprocessor(x, y_in, sw_in) - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, y_in) - self.assertAllEqual(sw, sw_in) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - def test_sequence_length_override(self): input_data = "airplane at airport" - x = self.preprocessor(input_data, sequence_length=4) + preprocessor = GPTNeoXPreprocessor(**self.init_kwargs) + x = preprocessor(input_data, sequence_length=4) self.assertAllEqual(x["token_ids"], [6, 1, 3, 6]) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer.py index 
3935d85a65..cc63e99af6 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.api_export import keras_nlp_export +from keras_nlp.backend import keras from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer -@keras_nlp_export("keras_nlp.models.GPTNeoXTokenizer") +@keras.saving.register_keras_serializable(package="keras_nlp") class GPTNeoXTokenizer(BytePairTokenizer): """A GPTNeoX tokenizer using Byte-Pair Encoding subword segmentation. @@ -45,32 +45,39 @@ class GPTNeoXTokenizer(BytePairTokenizer): def __init__( self, - vocabulary, - merges, + vocabulary=None, + merges=None, **kwargs, ): - # Special tokens. - end_token = "<|endoftext|>" + # GPTNeoX uses the same start as end token, i.e., "<|endoftext|>". + self.end_token = self.start_token = "<|endoftext|>" super().__init__( vocabulary=vocabulary, merges=merges, - unsplittable_tokens=[end_token], + unsplittable_tokens=[self.end_token], **kwargs, ) - # Check whether special tokens are present in the vocabulary. - if end_token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{end_token}'` in the provided " - f"`vocabulary`. Please provide `'{end_token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) + def set_vocabulary_and_merges(self, vocabulary, merges): + super().set_vocabulary_and_merges(vocabulary, merges) - self.end_token_id = self.token_to_id(end_token) - # GPTNeoX uses the same start as end token, i.e., "<|endoftext|>". - self.start_token_id = self.end_token_id - self.pad_token_id = 0 + if vocabulary is not None: + # Check for necessary special tokens. + if self.end_token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{self.end_token}'` in the provided " + f"`vocabulary`. Please provide `'{self.end_token}'` in " + "your `vocabulary` or use a pretrained `vocabulary` name." + ) + + self.end_token_id = self.token_to_id(self.end_token) + self.start_token_id = self.end_token_id + self.pad_token_id = 0 + else: + self.end_token_id = None + self.start_token_id = None + self.pad_token_id = None def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py index da6daa2c24..c23b7dd44d 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py @@ -12,82 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from keras_nlp.backend import keras from keras_nlp.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer from keras_nlp.tests.test_case import TestCase class GPTNeoXTokenizerTest(TestCase): def setUp(self): - self.vocab = { - "<|endoftext|>": 0, - "Ġair": 1, - "plane": 2, - "Ġat": 3, - "port": 4, - "Ġkoh": 5, - "li": 6, - "Ġis": 7, - "Ġthe": 8, - "Ġbest": 9, - } - self.merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - self.merges += [ - "Ġa t", - "p o", - "r t", - "o h", - "l i", - "Ġi s", - "Ġb e", - "s t", + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + self.input_data = [ + " airplane at airport<|endoftext|>", + " airplane airport", ] - self.merges += [ - "Ġt h", - "Ġai r", - "pl a", - "Ġk oh", - "Ġth e", - "Ġbe st", - "po rt", - ] - self.merges += ["pla ne"] - self.tokenizer = GPTNeoXTokenizer( - vocabulary=self.vocab, merges=self.merges + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=GPTNeoXTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]], ) - def test_tokenize(self): - input_data = " airplane at airport" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [1, 2, 3, 1, 4]) - - def test_tokenize_end_token(self): - input_data = " airplane at airport<|endoftext|>" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [1, 2, 3, 1, 4, 0]) - - def test_tokenize_batch(self): - input_data = [" airplane at airport", " kohli is the best"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[1, 2, 3, 1, 4], [5, 6, 7, 8, 9]]) - - def test_detokenize(self): - input_tokens = [1, 2, 3, 1, 4] - output = self.tokenizer.detokenize(input_tokens) - self.assertEqual(output, " airplane at airport") - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 10) - def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): GPTNeoXTokenizer(vocabulary=["a", "b", "c"], merges=[]) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), - ) diff --git a/keras_nlp/models/llama/__init__.py b/keras_nlp/models/llama/__init__.py new file mode 100644 index 0000000000..ba0c2545e4 --- /dev/null +++ b/keras_nlp/models/llama/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
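The `LlamaAttention` layer added below implements grouped-query attention: `num_query_heads` query heads share a smaller set of `num_key_value_heads` key/value heads, and the key/value tensors are tiled up to the query head count before the attention einsum. A minimal NumPy sketch of that head sharing (illustrative only, not part of this diff; all names and sizes are made up):

```python
import numpy as np

# Illustrative sizes only.
batch, seq_len, head_dim = 1, 4, 8
num_query_heads, num_key_value_heads = 8, 2
groups = num_query_heads // num_key_value_heads  # query heads per kv head

query = np.random.randn(batch, seq_len, num_query_heads, head_dim)
key = np.random.randn(batch, seq_len, num_key_value_heads, head_dim)

# Repeat the kv heads along the head axis, mirroring
# `ops.tile(key, [1, 1, self.num_key_value_groups, 1])` in the layer below.
key = np.tile(key, (1, 1, groups, 1))

# Scaled dot-product scores per head: (batch, heads, query_len, key_len).
scores = np.einsum("bqhd,bkhd->bhqk", query, key) / np.sqrt(head_dim)
```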
diff --git a/keras_nlp/models/llama/llama_attention.py b/keras_nlp/models/llama/llama_attention.py new file mode 100644 index 0000000000..a2604e5351 --- /dev/null +++ b/keras_nlp/models/llama/llama_attention.py @@ -0,0 +1,201 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from keras_nlp.backend import keras +from keras_nlp.backend import ops +from keras_nlp.layers.modeling.rotary_embedding import RotaryEmbedding +from keras_nlp.utils.keras_utils import clone_initializer + + +class LlamaAttention(keras.layers.Layer): + """Grouped query attention for Llama models""" + + def __init__( + self, + num_query_heads, + num_key_value_heads, + rope_scaling_factor=1.0, + kernel_initializer="glorot_uniform", + rope_max_wavelength=10000, + max_sequence_length=512, + **kwargs, + ): + super().__init__(**kwargs) + self.num_query_heads = num_query_heads + self.num_key_value_heads = num_key_value_heads + + self.num_key_value_groups = num_query_heads // num_key_value_heads + + self.kernel_initializer = keras.initializers.get(kernel_initializer) + self.max_sequence_length = max_sequence_length + + self.rope_scaling_factor = rope_scaling_factor + self.rope_max_wavelength = rope_max_wavelength + + def build(self, inputs_shape): + self.hidden_dim = inputs_shape[-1] + self.attn_head_size = self.hidden_dim // self.num_query_heads + + # Einsum variables: + # b = batch size + # q = query length + # k = key/value length + # m = model dim + # u = num query heads + # v = num key/value heads + # h = head dim + self._query_dense = keras.layers.EinsumDense( + equation="bqm,muh->bquh", + output_shape=(None, self.num_query_heads, self.attn_head_size), + kernel_initializer=clone_initializer(self.kernel_initializer), + name="query", + ) + self._query_dense.build(inputs_shape) + self._key_dense = keras.layers.EinsumDense( + equation="bkm,mvh->bkvh", + output_shape=(None, self.num_key_value_heads, self.attn_head_size), + kernel_initializer=clone_initializer(self.kernel_initializer), + name="key", + ) + self._key_dense.build(inputs_shape) + + self._value_dense = keras.layers.EinsumDense( + equation="bkm,mvh->bkvh", + output_shape=(None, self.num_key_value_heads, self.attn_head_size), + kernel_initializer=clone_initializer(self.kernel_initializer), + name="value", + ) + self._value_dense.build(inputs_shape) + + self._softmax = keras.layers.Softmax(axis=-1, name="attention_softmax") + + self._output_dense = keras.layers.EinsumDense( + equation="bqm,mh->bqh", + output_shape=(None, self.hidden_dim), + kernel_initializer=clone_initializer(self.kernel_initializer), + name="attention_output", + ) + self._output_dense.build(inputs_shape) + + self._rotary_embedding_layer = RotaryEmbedding( + max_wavelength=self.rope_max_wavelength, + scaling_factor=self.rope_scaling_factor, + ) + self._rotary_embedding_layer.build(inputs_shape) + + self.built = True + + def call( + self, + hidden_states, + attention_mask=None, + cache=None, + cache_update_index=None, + ): + query = self._query_dense(hidden_states) 
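+        # Note: `cache` packs keys and values along axis 1, i.e.
+        # `cache[:, 0, ...]` holds keys and `cache[:, 1, ...]` holds values,
+        # each of shape `(batch, max_seq_len, num_key_value_heads, head_dim)`.
+        # `cache_update_index` is the sequence position at which the newly
+        # computed key/value slice is written back into the cache.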
+
+        if cache is not None:
+            key_cache = cache[:, 0, ...]
+            value_cache = cache[:, 1, ...]
+            if cache_update_index is None:
+                key = key_cache
+                value = value_cache
+            else:
+                key_update = self._key_dense(hidden_states)
+                value_update = self._value_dense(hidden_states)
+                start = [0, cache_update_index, 0, 0]
+                key = ops.slice_update(key_cache, start, key_update)
+                value = ops.slice_update(value_cache, start, value_update)
+                cache = ops.stack((key, value), axis=1)
+        else:
+            if cache_update_index is not None:
+                raise ValueError(
+                    "`cache_update_index` should not be set if `cache` is "
+                    f"`None`. Received: cache={cache}, "
+                    f"cache_update_index={cache_update_index}"
+                )
+            key = self._key_dense(hidden_states)
+            value = self._value_dense(hidden_states)
+
+        query = self._rotary_embedding_layer(query)
+        key = self._rotary_embedding_layer(key)
+
+        key = ops.tile(key, [1, 1, self.num_key_value_groups, 1])
+        value = ops.tile(value, [1, 1, self.num_key_value_groups, 1])
+
+        attention_output, attention_scores = self._compute_attention(
+            query, key, value, attention_mask
+        )
+
+        attention_output_shape = ops.shape(attention_output)
+
+        attention_output = ops.reshape(
+            attention_output,
+            [
+                attention_output_shape[0],
+                attention_output_shape[1],
+                self.hidden_dim,
+            ],
+        )
+
+        attention_output = self._output_dense(attention_output)
+
+        if cache is not None:
+            return (attention_output, cache)
+        return attention_output
+
+    def _masked_softmax(self, attention_scores, attention_mask=None):
+        if attention_mask is not None:
+            mask_expansion_axis = -3
+            for _ in range(
+                len(attention_scores.shape) - len(attention_mask.shape)
+            ):
+                attention_mask = ops.expand_dims(
+                    attention_mask, axis=mask_expansion_axis
+                )
+        return self._softmax(attention_scores, attention_mask)
+
+    def _compute_attention(self, query, key, value, attention_mask=None):
+        attention_scores = ops.einsum("aecd,abcd->acbe", key, query)
+
+        norm_factor = ops.sqrt(
+            ops.convert_to_tensor(self.attn_head_size, self.compute_dtype)
+        )
+
+        attention_scores /= norm_factor
+
+        attention_scores = self._masked_softmax(
+            attention_scores, attention_mask
+        )
+        attention_output = ops.einsum(
+            "acbe,aecd->abcd", attention_scores, value
+        )
+
+        return attention_output, attention_scores
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "num_query_heads": self.num_query_heads,
+                "kernel_initializer": keras.initializers.serialize(
+                    self.kernel_initializer
+                ),
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
+                "num_key_value_heads": self.num_key_value_heads,
+                "max_sequence_length": self.max_sequence_length,
+            }
+        )
+        return config
diff --git a/keras_nlp/models/llama/llama_backbone.py b/keras_nlp/models/llama/llama_backbone.py
new file mode 100644
index 0000000000..8e501d9ee8
--- /dev/null
+++ b/keras_nlp/models/llama/llama_backbone.py
@@ -0,0 +1,155 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras_nlp.backend import keras
+from keras_nlp.backend import ops
+from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
+from keras_nlp.models.backbone import Backbone
+from keras_nlp.models.llama.llama_decoder import LlamaDecoder
+from keras_nlp.models.llama.llama_layernorm import LlamaLayerNorm
+
+
+def _llama_kernel_initializer(stddev=0.02):
+    return keras.initializers.RandomNormal(stddev=stddev)
+
+
+@keras.saving.register_keras_serializable(package="keras_nlp")
+class LlamaBackbone(Backbone):
+    """
+    LLaMA core network with hyperparameters.
+
+    This network implements a Transformer-based decoder network,
+    LLaMA, as described in ["LLaMA: Open and Efficient Foundation Language Models"](https://arxiv.org/abs/2302.13971).
+
+    The default constructor gives a fully customizable, randomly initialized
+    LLaMA model with any number of layers, heads, and embedding
+    dimensions. This backbone also supports LLaMA2 checkpoints.
+
+    Args:
+        vocabulary_size: int. The size of the token vocabulary.
+        num_layers: int. The number of transformer layers.
+        num_query_heads: int. The number of query attention heads for each
+            transformer layer. The hidden size must be divisible by the
+            number of query heads.
+        hidden_dim: int. The size of the transformer hidden states.
+        intermediate_dim: int. The output dimension of the first Dense layer
+            in a two-layer feedforward network for each transformer.
+        num_key_value_heads: int. The number of key/value attention heads,
+            used to implement Grouped Query Attention. If
+            `num_key_value_heads == num_query_heads`, the model uses
+            Multi-Head Attention (MHA); if `num_key_value_heads == 1`, it
+            uses Multi-Query Attention (MQA).
+        rope_scaling_factor: float. The scaling factor used to compute the
+            rotary embeddings.
+        rope_max_wavelength: int. The maximum angular wavelength of the
+            sine/cosine curves, for rotary embeddings.
+        layer_norm_epsilon: float. A value added to the denominator for
+            numerical stability.
+        max_sequence_length: int. The maximum sequence length that this
+            decoder can consume.
+    """
+
+    def __init__(
+        self,
+        vocabulary_size,
+        num_layers,
+        num_query_heads,
+        hidden_dim,
+        intermediate_dim,
+        num_key_value_heads,
+        rope_scaling_factor=1.0,
+        rope_max_wavelength=10000,
+        layer_norm_epsilon=1e-5,
+        max_sequence_length=4096,
+        **kwargs,
+    ):
+        # Inputs
+        token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids")
+        padding_mask = keras.Input(
+            shape=(None,), dtype="int32", name="padding_mask"
+        )
+
+        # Embed tokens
+        token_embedding = ReversibleEmbedding(
+            input_dim=vocabulary_size,
+            output_dim=hidden_dim,
+            embeddings_initializer=_llama_kernel_initializer(stddev=0.01),
+            tie_weights=False,
+            name="token_embedding",
+        )(token_ids)
+
+        x = token_embedding
+
+        # Apply successive transformer decoder blocks.
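+        # Each block is pre-normalized: a layer norm feeds grouped-query
+        # self-attention, and a second layer norm feeds a gated (SwiGLU-style)
+        # feedforward, each wrapped in a residual connection. See
+        # `LlamaDecoder` below.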
+ for i in range(num_layers): + x = LlamaDecoder( + intermediate_dim=intermediate_dim, + num_query_heads=num_query_heads, + num_key_value_heads=num_key_value_heads, + rope_scaling_factor=rope_scaling_factor, + max_sequence_length=max_sequence_length, + rope_max_wavelength=rope_max_wavelength, + layer_norm_epsilon=layer_norm_epsilon, + activation=ops.silu, + kernel_initializer=_llama_kernel_initializer(stddev=0.02), + name=f"transformer_layer_{i}", + )(x, decoder_padding_mask=padding_mask) + + sequence_output = LlamaLayerNorm( + name="layer_norm", + epsilon=layer_norm_epsilon, + )(x) + + # Instantiate using Functional API Model constructor + super().__init__( + inputs={ + "token_ids": token_ids, + "padding_mask": padding_mask, + }, + outputs=sequence_output, + **kwargs, + ) + # All references to `self` below this line + self.vocabulary_size = vocabulary_size + self.num_layers = num_layers + self.num_query_heads = num_query_heads + self.hidden_dim = hidden_dim + self.intermediate_dim = intermediate_dim + self.rope_max_wavelength = rope_max_wavelength + self.num_key_value_heads = num_key_value_heads + self.rope_scaling_factor = rope_scaling_factor + self.max_sequence_length = max_sequence_length + self.layer_norm_epsilon = layer_norm_epsilon + + def get_config(self): + config = super().get_config() + config.update( + { + "vocabulary_size": self.vocabulary_size, + "num_layers": self.num_layers, + "num_query_heads": self.num_query_heads, + "hidden_dim": self.hidden_dim, + "intermediate_dim": self.intermediate_dim, + "rope_max_wavelength": self.rope_max_wavelength, + "rope_scaling_factor": self.rope_scaling_factor, + "num_key_value_heads": self.num_key_value_heads, + "max_sequence_length": self.max_sequence_length, + "layer_norm_epsilon": self.layer_norm_epsilon, + } + ) + return config + + @property + def token_embedding(self): + return self.get_layer("token_embedding") diff --git a/keras_nlp/models/llama/llama_backbone_test.py b/keras_nlp/models/llama/llama_backbone_test.py new file mode 100644 index 0000000000..efff972c6b --- /dev/null +++ b/keras_nlp/models/llama/llama_backbone_test.py @@ -0,0 +1,52 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from keras_nlp.backend import ops +from keras_nlp.models.llama.llama_backbone import LlamaBackbone +from keras_nlp.tests.test_case import TestCase + + +class LlamaTest(TestCase): + def setUp(self): + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_query_heads": 4, + "num_key_value_heads": 2, + "hidden_dim": 8, + "intermediate_dim": 8, + "max_sequence_length": 10, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), + } + + def test_backbone_basics(self): + self.run_backbone_test( + cls=LlamaBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 8), + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=LlamaBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/llama/llama_decoder.py b/keras_nlp/models/llama/llama_decoder.py new file mode 100644 index 0000000000..47bac478cc --- /dev/null +++ b/keras_nlp/models/llama/llama_decoder.py @@ -0,0 +1,206 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from keras_nlp.backend import keras +from keras_nlp.backend import ops +from keras_nlp.layers.modeling.transformer_layer_utils import ( + compute_causal_mask, +) +from keras_nlp.layers.modeling.transformer_layer_utils import ( + merge_padding_and_attention_mask, +) +from keras_nlp.models.llama.llama_attention import LlamaAttention +from keras_nlp.models.llama.llama_layernorm import LlamaLayerNorm +from keras_nlp.utils.keras_utils import clone_initializer + + +class LlamaDecoder(keras.layers.Layer): + """Llama decoder block.""" + + def __init__( + self, + intermediate_dim, + num_query_heads, + num_key_value_heads, + rope_scaling_factor=1.0, + activation="relu", + layer_norm_epsilon=1e-5, + kernel_initializer="glorot_uniform", + rope_max_wavelength=10000, + max_sequence_length=512, + **kwargs, + ): + super().__init__(**kwargs) + self.intermediate_dim = intermediate_dim + self.num_query_heads = num_query_heads + self.num_key_value_heads = num_key_value_heads + + self.rope_max_wavelength = rope_max_wavelength + self.rope_scaling_factor = rope_scaling_factor + + self.max_sequence_length = max_sequence_length + self.activation = keras.activations.get(activation) + self.layer_norm_epsilon = layer_norm_epsilon + self.kernel_initializer = keras.initializers.get(kernel_initializer) + + def build(self, decoder_sequence_shape): + self.hidden_dim = decoder_sequence_shape[-1] + + # Self attention layers. 
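+        # (All sub-layers in this `build` are built eagerly against the known
+        # input shape, so weights are created here rather than on first call.)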
+ self._self_attention_layer = LlamaAttention( + num_query_heads=self.num_query_heads, + num_key_value_heads=self.num_key_value_heads, + rope_max_wavelength=self.rope_max_wavelength, + max_sequence_length=self.max_sequence_length, + rope_scaling_factor=self.rope_scaling_factor, + kernel_initializer=clone_initializer(self.kernel_initializer), + ) + self._self_attention_layer.build(decoder_sequence_shape) + + self._self_attention_layernorm = LlamaLayerNorm( + epsilon=self.layer_norm_epsilon, + ) + self._self_attention_layernorm.build(decoder_sequence_shape) + + # Feedforward layers. + self._feedforward_intermediate_dense = keras.layers.Dense( + self.intermediate_dim, + kernel_initializer=clone_initializer(self.kernel_initializer), + ) + self._feedforward_intermediate_dense.build(decoder_sequence_shape) + + self._feedforward_gate_dense = keras.layers.Dense( + self.intermediate_dim, + activation=self.activation, + kernel_initializer=clone_initializer(self.kernel_initializer), + ) + self._feedforward_gate_dense.build(decoder_sequence_shape) + + self._feedforward_output_dense = keras.layers.Dense( + self.hidden_dim, + kernel_initializer=clone_initializer(self.kernel_initializer), + ) + + intermediate_shape = list(decoder_sequence_shape) + intermediate_shape[-1] = self.intermediate_dim + self._feedforward_output_dense.build(tuple(intermediate_shape)) + + self._feedforward_layernorm = LlamaLayerNorm( + epsilon=self.layer_norm_epsilon, + ) + self._feedforward_layernorm.build(decoder_sequence_shape) + + self.built = True + + def call( + self, + decoder_sequence, + decoder_padding_mask=None, + decoder_attention_mask=None, + self_attention_cache=None, + self_attention_cache_update_index=None, + ): + self_attention_mask = self._compute_self_attention_mask( + decoder_sequence=decoder_sequence, + decoder_padding_mask=decoder_padding_mask, + decoder_attention_mask=decoder_attention_mask, + self_attention_cache=self_attention_cache, + self_attention_cache_update_index=self_attention_cache_update_index, + ) + residual = decoder_sequence + + x = self._self_attention_layernorm( + decoder_sequence, + ) + + x = self._self_attention_layer( + hidden_states=x, + attention_mask=self_attention_mask, + cache=self_attention_cache, + cache_update_index=self_attention_cache_update_index, + ) + + if self_attention_cache is not None: + x, self_attention_cache = x + + x = x + residual + residual = x + + x = self._feedforward_layernorm(x) + gate_output = self._feedforward_gate_dense(x) + + x = self._feedforward_intermediate_dense(x) + + x = self._feedforward_output_dense(ops.multiply(x, gate_output)) + + decoder_output = x + residual + + if self_attention_cache is not None: + return (decoder_output, self_attention_cache) + return decoder_output + + def _compute_self_attention_mask( + self, + decoder_sequence, + decoder_padding_mask, + decoder_attention_mask, + self_attention_cache=None, + self_attention_cache_update_index=None, + ): + decoder_mask = merge_padding_and_attention_mask( + decoder_sequence, decoder_padding_mask, decoder_attention_mask + ) + batch_size = ops.shape(decoder_sequence)[0] + input_length = output_length = ops.shape(decoder_sequence)[1] + # We need to handle a rectangular causal mask when doing cached + # decoding. For generative inference, `decoder_sequence` will + # generally be length 1, and `cache` will be the full generation length. 
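+        # The resulting mask is rectangular, shaped
+        # `(batch, output_length, input_length)`: rows index the tokens being
+        # decoded and columns index every cached position they may attend to.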
+        if self_attention_cache is not None:
+            input_length = ops.shape(self_attention_cache)[2]
+
+        causal_mask = compute_causal_mask(
+            batch_size,
+            input_length,
+            output_length,
+            0
+            if self_attention_cache_update_index is None
+            else self_attention_cache_update_index,
+        )
+        return (
+            ops.minimum(decoder_mask, causal_mask)
+            if decoder_mask is not None
+            else causal_mask
+        )
+
+    def compute_output_shape(self, decoder_sequence_shape):
+        return decoder_sequence_shape
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "intermediate_dim": self.intermediate_dim,
+                "num_query_heads": self.num_query_heads,
+                "rope_max_wavelength": self.rope_max_wavelength,
+                "rope_scaling_factor": self.rope_scaling_factor,
+                "num_key_value_heads": self.num_key_value_heads,
+                "max_sequence_length": self.max_sequence_length,
+                "activation": keras.activations.serialize(self.activation),
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "kernel_initializer": keras.initializers.serialize(
+                    self.kernel_initializer
+                ),
+            }
+        )
+        return config
diff --git a/keras_nlp/models/llama/llama_layernorm.py b/keras_nlp/models/llama/llama_layernorm.py
new file mode 100644
index 0000000000..0e85a45625
--- /dev/null
+++ b/keras_nlp/models/llama/llama_layernorm.py
@@ -0,0 +1,37 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras_nlp.backend import keras
+from keras_nlp.backend import ops
+
+# TODO: Should be replaced with LayerNormalization with `rms_scaling` param
+# https://github.com/keras-team/keras-core/pull/726
+
+
+class LlamaLayerNorm(keras.layers.Layer):
+    def __init__(self, epsilon=1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self.epsilon = epsilon
+
+    def build(self, input_shape):
+        self.weight = self.add_weight(
+            name="weight",
+            shape=(input_shape[-1],),
+            initializer="ones",
+        )
+        self.built = True
+
+    def call(self, hidden_states):
+        variance = ops.mean(ops.square(hidden_states), axis=-1, keepdims=True)
+        hidden_states = hidden_states / ops.sqrt(variance + self.epsilon)
+        return self.weight * hidden_states
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({"epsilon": self.epsilon})
+        return config
diff --git a/keras_nlp/models/mistral/__init__.py b/keras_nlp/models/mistral/__init__.py
new file mode 100644
index 0000000000..ba0c2545e4
--- /dev/null
+++ b/keras_nlp/models/mistral/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
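Both `LlamaLayerNorm` above and the `MistralLayerNormalization` layer later in this diff implement RMS normalization: unlike standard layer normalization there is no mean-centering and no bias, only a learned per-feature scale. A minimal NumPy sketch of the same computation (standalone and illustrative, not part of the diff):

```python
import numpy as np

def rms_norm(x, weight, epsilon=1e-6):
    # Same computation as LlamaLayerNorm.call: divide by the root mean
    # square of the features, then apply the learned scale.
    variance = np.mean(np.square(x), axis=-1, keepdims=True)
    return weight * (x / np.sqrt(variance + epsilon))

x = np.random.randn(2, 5, 8).astype("float32")
weight = np.ones(8, dtype="float32")  # matches the "ones" initializer
out = rms_norm(x, weight)
# Each feature vector now has a root mean square of ~1.
print(np.sqrt(np.mean(np.square(out), axis=-1)))
```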
diff --git a/keras_nlp/models/mistral/mistral_attention.py b/keras_nlp/models/mistral/mistral_attention.py
new file mode 100644
index 0000000000..680f1f6d1b
--- /dev/null
+++ b/keras_nlp/models/mistral/mistral_attention.py
@@ -0,0 +1,293 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras_nlp.backend import keras
+from keras_nlp.backend import ops
+from keras_nlp.layers.modeling.rotary_embedding import RotaryEmbedding
+from keras_nlp.utils.keras_utils import clone_initializer
+
+
+# This is just a self-attention layer in Mistral, but it can be generalized
+# to use the `keras_nlp.layers.CachedMultiHeadAttention` API. Since this layer
+# implements grouped-query attention and sliding window attention, it might be
+# useful outside of Mistral itself.
+# TODO(tirthasheshpatel): Generalize the attention layer
+# TODO(tirthasheshpatel): Merge `LlamaAttention` with this layer
+# TODO(tirthasheshpatel): Use flash attention
+class CachedMistralAttention(keras.layers.Layer):
+    """A cached grouped-query attention layer with sliding window."""
+
+    def __init__(
+        self,
+        num_query_heads,
+        num_key_value_heads,
+        rope_max_wavelength=10000,
+        rope_scaling_factor=1.0,
+        kernel_initializer="glorot_uniform",
+        sliding_window=512,
+        dropout=0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self._num_query_heads = num_query_heads
+        self._num_key_value_heads = num_key_value_heads
+        self._sliding_window = sliding_window
+        self._dropout = dropout
+
+        self._num_key_value_groups = num_query_heads // num_key_value_heads
+        self._rope_max_wavelength = rope_max_wavelength
+
+        self._kernel_initializer = keras.initializers.get(
+            clone_initializer(kernel_initializer)
+        )
+
+        self._rope_scaling_factor = rope_scaling_factor
+
+    def build(self, inputs_shape):
+        # Einsum variables:
+        # b = batch size
+        # q = query length
+        # k = key/value length
+        # m = model dim
+        # u = num query heads
+        # v = num key/value heads
+        # h = head dim
+        self._hidden_dim = inputs_shape[-1]
+        self._head_dim = self._hidden_dim // self._num_query_heads
+
+        self._query_dense = keras.layers.EinsumDense(
+            equation="bqm,muh->bquh",
+            output_shape=(None, self._num_query_heads, self._head_dim),
+            kernel_initializer=self._kernel_initializer,
+            dtype=self.compute_dtype,
+            name="query",
+        )
+        self._query_dense.build(inputs_shape)
+
+        self._key_dense = keras.layers.EinsumDense(
+            equation="bkm,mvh->bkvh",
+            output_shape=(
+                None,
+                self._num_key_value_heads,
+                self._head_dim,
+            ),
+            kernel_initializer=self._kernel_initializer,
+            dtype=self.compute_dtype,
+            name="key",
+        )
+        self._key_dense.build(inputs_shape)
+
+        self._value_dense = keras.layers.EinsumDense(
+            equation="bkm,mvh->bkvh",
+            output_shape=(
+                None,
+                self._num_key_value_heads,
+                self._head_dim,
+            ),
+            kernel_initializer=self._kernel_initializer,
+            dtype=self.compute_dtype,
+            name="value",
+        )
+        self._value_dense.build(inputs_shape)
+
+        self._softmax = keras.layers.Softmax(axis=-1, name="attention_softmax")
+
+        self._dropout_layer = keras.layers.Dropout(
+            rate=self._dropout, dtype=self.compute_dtype
+        )
+
+        self._output_dense = keras.layers.EinsumDense(
+            equation="bquh,uhm->bqm",
+            output_shape=(None, self._hidden_dim),
+            kernel_initializer=self._kernel_initializer,
+            dtype=self.compute_dtype,
+            name="attention_output",
+        )
+        self._output_dense.build(
+            (None, None, self._num_query_heads, self._head_dim)
+        )
+
+        self.rotary_embedding_layer = RotaryEmbedding(
+            max_wavelength=self._rope_max_wavelength,
+            scaling_factor=self._rope_scaling_factor,
+            dtype=self.compute_dtype,
+        )
+
+        self._dot_product_equation = "bquh,bkuh->buqk"
+        self._combine_equation = "buqk,bkuh->bquh"
+
+        self.built = True
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        cache=None,
+        cache_update_index=None,
+        training=None,
+    ):
+        seq_len = ops.shape(hidden_states)[1]
+        start_index = (
+            cache_update_index if cache_update_index is not None else 0
+        )
+        # If `cache_update_index` is a tensor, RotaryEmbedding expects it
+        # to have dtype `self.compute_dtype`.
+        start_index = ops.cast(
+            start_index, self.rotary_embedding_layer.compute_dtype
+        )
+
+        query = self._query_dense(hidden_states)
+
+        # Note that the original PyTorch implementation uses
+        # view_as_complex/view_as_real while we use split/concatenate to
+        # convert to/from complex numbers. The transformations below make
+        # the rope computation numerically equivalent to the original
+        # implementation.
+        def _mistral_rope(x):
+            x = ops.concatenate([x[..., ::2], x[..., 1::2]], axis=-1)
+            x = self.rotary_embedding_layer(x, start_index=start_index)
+            x = ops.reshape(
+                ops.stack(ops.split(x, 2, axis=-1), axis=-1), ops.shape(x)
+            )
+            return x
+
+        # Compute RoPE for queries
+        query = _mistral_rope(query)
+
+        def _compute_key_value(x):
+            key, value = self._key_dense(x), self._value_dense(x)
+            key = _mistral_rope(key)
+            return key, value
+
+        if cache is not None:
+            cache_k = cache[:, 0, ...]
+            cache_v = cache[:, 1, ...]
+
+            if cache_update_index is not None:
+                # Compute the new keys and values
+                key, value = _compute_key_value(hidden_states)
+
+                # The cache is a rotating buffer; we want to wrap around if
+                # the sequence length exceeds the sliding window.
+                update_end_index = (
+                    cache_update_index + seq_len - 1
+                ) % self._sliding_window + 1
+                update_end_index = ops.cast(update_end_index, "int32")
+                cache_update_index = cache_update_index % self._sliding_window
+                update_start_index = ops.cond(
+                    update_end_index > cache_update_index,
+                    lambda: ops.cast(cache_update_index, "int32"),
+                    lambda: ops.cast(0, "int32"),
+                )
+                # Also note that the update step below assumes that the
+                # sequence length is always one when `cache_update_index != 0`.
+                # This is necessary to support XLA compilation. Ideally, we
+                # would want to use
+                # `key[:, -(update_end_index - update_start_index):, ...]`
+                # as the update, but updating via a dynamic slice gives an
+                # XLA compilation error in TensorFlow.
+                # Passing a sequence of length > 1 with a cache update might
+                # give incorrect results, since there is no way to determine
+                # how many of the most recent tokens should be saved once the
+                # tokens exceed the sliding window length.
+                cache_k = ops.slice_update(
+                    cache_k,
+                    [0, update_start_index, 0, 0],
+                    # We slice the keys and values in case the user has
+                    # passed a sequence of length > `self._sliding_window`;
+                    # we want to prefill the cache using just the most
+                    # recent values in the sliding window.
+ ops.cast( + key[:, -self._sliding_window :, ...], cache_k.dtype + ), + ) + cache_v = ops.slice_update( + cache_v, + [0, update_start_index, 0, 0], + ops.cast( + value[:, -self._sliding_window :, ...], cache_v.dtype + ), + ) + cache = ops.stack([cache_k, cache_v], axis=1) + + # Get the required keys and values from the cache. + # Since we expect the user to pass a fixed-size cache, we just + # pick the first few slices up-to and including the newly computed + # keys and values. + cache_k = cache_k[:, :update_end_index, ...] + cache_v = cache_v[:, :update_end_index, ...] + + key = ops.cast(cache_k, dtype=self.compute_dtype) + value = ops.cast(cache_v, dtype=self.compute_dtype) + else: + # Compute keys and values + key, value = _compute_key_value(hidden_states) + + # [batch_shape, seq_len, num_key_value_heads, head_dim] + # -> [batch_shape, seq_len, num_heads, head_dim] + key = ops.repeat(key, repeats=self._num_key_value_groups, axis=2) + value = ops.repeat(value, repeats=self._num_key_value_groups, axis=2) + + attention_output = self._compute_attention( + query, key, value, attention_mask + ) + + attention_output = self._dropout_layer( + attention_output, training=training + ) + + attention_output = self._output_dense(attention_output) + + if cache is not None: + return attention_output, cache + return attention_output + + def _masked_softmax(self, attention_scores, attention_mask=None): + if attention_mask is not None: + return self._softmax( + attention_scores, attention_mask[:, None, :, :] + ) + return self._softmax(attention_scores) + + def _compute_attention(self, query, key, value, attention_mask=None): + attention_scores = ops.einsum(self._dot_product_equation, key, query) + + norm_factor = ops.sqrt(ops.cast(self._head_dim, self.compute_dtype)) + + attention_scores = attention_scores / norm_factor + + attention_scores = self._masked_softmax( + attention_scores, attention_mask + ) + attention_output = ops.einsum( + self._combine_equation, attention_scores, value + ) + + return attention_output + + def get_config(self): + config = super().get_config() + config.update( + { + "num_query_heads": self._num_query_heads, + "num_key_value_heads": self._num_key_value_heads, + "rope_max_wavelength": self._rope_max_wavelength, + "rope_scaling_factor": self._rope_scaling_factor, + "kernel_initializer": keras.initializers.serialize( + self._kernel_initializer + ), + "sliding_window": self._sliding_window, + "dropout": self._dropout, + } + ) + return config diff --git a/keras_nlp/models/mistral/mistral_backbone.py b/keras_nlp/models/mistral/mistral_backbone.py new file mode 100644 index 0000000000..fbdebd74b4 --- /dev/null +++ b/keras_nlp/models/mistral/mistral_backbone.py @@ -0,0 +1,195 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
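The rolling-buffer cache update in `CachedMistralAttention` above is easier to follow with concrete numbers. A plain-Python sketch of the wrap-around write positions (assuming single-token decoding steps, as the layer's comments require):

```python
# Illustrative only: mirrors the index arithmetic in
# CachedMistralAttention.call for the rolling KV cache.
sliding_window = 4
seq_len = 1  # one new token per decoding step
for cache_update_index in range(8):
    update_end_index = (cache_update_index + seq_len - 1) % sliding_window + 1
    update_start_index = cache_update_index % sliding_window
    print(f"token {cache_update_index}: K/V written to cache slot "
          f"[{update_start_index}:{update_end_index}]")
# token 0 -> slot [0:1], ..., token 3 -> slot [3:4],
# token 4 wraps back around to slot [0:1].
```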
+from keras_nlp.backend import keras
+from keras_nlp.backend import ops
+from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
+from keras_nlp.models.backbone import Backbone
+from keras_nlp.models.mistral.mistral_layer_norm import (
+    MistralLayerNormalization,
+)
+from keras_nlp.models.mistral.mistral_transformer_decoder import (
+    MistralTransformerDecoder,
+)
+
+
+def _mistral_kernel_initializer(stddev=0.02):
+    return keras.initializers.RandomNormal(stddev=stddev)
+
+
+@keras.saving.register_keras_serializable(package="keras_nlp")
+class MistralBackbone(Backbone):
+    """
+    The Mistral Transformer core architecture with hyperparameters.
+
+    This network implements a Transformer-based decoder network,
+    Mistral, as described in
+    ["Mistral 7B"](https://arxiv.org/pdf/2310.06825.pdf).
+    It includes the embedding lookups and transformer layers.
+
+    The default constructor gives a fully customizable, randomly initialized
+    Mistral model with any number of layers, heads, and embedding
+    dimensions. To load preset architectures and weights, use the `from_preset`
+    constructor.
+
+    Args:
+        vocabulary_size (int): The size of the token vocabulary.
+        num_layers (int): The number of transformer layers.
+        num_query_heads (int): The number of query attention heads for
+            each transformer.
+        hidden_dim (int): The size of the transformer hidden states.
+        intermediate_dim (int): The output dimension of the first Dense layer
+            in a three-layer feedforward network for each transformer.
+        num_key_value_heads (int): The number of key and value attention heads
+            for each transformer.
+        rope_max_wavelength (int, optional): The maximum angular wavelength of
+            the sine/cosine curves, for rotary embeddings. Defaults to `10000`.
+        rope_scaling_factor (float, optional): The scaling factor for
+            calculating rotary embeddings. Defaults to `1.0`.
+        layer_norm_epsilon (float, optional): Epsilon for the layer
+            normalization layers in the transformer decoder. Defaults to
+            `1e-6`.
+        sliding_window (int, optional): The sliding window for the Mistral
+            attention layers. This controls the maximum cache size for the
+            attention layers in each transformer decoder. Only `sliding_window`
+            tokens are saved in the cache and used to generate the next token.
+            Defaults to `512`.
+        dtype (str, optional): The dtype policy for the Mistral model.
+
+    Examples:
+
+    ```python
+    input_data = {
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+    }
+
+    # Pretrained Mistral decoder.
+    model = keras_nlp.models.MistralBackbone.from_preset("mistral7b_base_en")
+    model(input_data)
+
+    # Randomly initialized Mistral decoder with custom config.
+ model = keras_nlp.models.MistralBackbone( + vocabulary_size=10, + hidden_dim=512, + num_layers=2, + num_query_heads=32, + num_key_value_heads=8, + intermediate_dim=1024, + sliding_window=512, + layer_norm_epsilon=1e-6, + dtype="float32" + ) + model(input_data) + ``` + """ + + def __init__( + self, + vocabulary_size, + num_layers, + num_query_heads, + hidden_dim, + intermediate_dim, + num_key_value_heads, + rope_max_wavelength=10000, + rope_scaling_factor=1.0, + layer_norm_epsilon=1e-6, + sliding_window=512, + dropout=0, + **kwargs, + ): + # Get the dtype + dtype = kwargs.pop("dtype", keras.backend.floatx()) + + # Inputs + token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") + padding_mask = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + + # Embed Tokens + token_embedding_layer = ReversibleEmbedding( + input_dim=vocabulary_size, + output_dim=hidden_dim, + tie_weights=False, + embeddings_initializer=_mistral_kernel_initializer(stddev=0.01), + dtype=dtype, + name="token_embedding", + ) + x = token_embedding_layer(token_ids) + + # Apply successive transformer decoder blocks + for i in range(num_layers): + x = MistralTransformerDecoder( + intermediate_dim=intermediate_dim, + num_query_heads=num_query_heads, + num_key_value_heads=num_key_value_heads, + rope_max_wavelength=rope_max_wavelength, + rope_scaling_factor=rope_scaling_factor, + layer_norm_epsilon=layer_norm_epsilon, + activation=ops.silu, + kernel_initializer=_mistral_kernel_initializer(stddev=0.02), + sliding_window=sliding_window, + dropout=dropout, + dtype=dtype, + name=f"transformer_layer_{i}", + )(x, decoder_padding_mask=padding_mask) + + sequence_output = MistralLayerNormalization( + name="sequence_output_layernorm", + epsilon=layer_norm_epsilon, + dtype=dtype, + )(x) + + # Instantiate using Functional API Model constructor + super().__init__( + inputs={ + "token_ids": token_ids, + "padding_mask": padding_mask, + }, + outputs=sequence_output, + **kwargs, + ) + + # All references to `self` below this line + self.vocabulary_size = vocabulary_size + self.num_layers = num_layers + self.num_query_heads = num_query_heads + self.hidden_dim = hidden_dim + self.intermediate_dim = intermediate_dim + self.rope_max_wavelength = rope_max_wavelength + self.num_key_value_heads = num_key_value_heads + self.rope_scaling_factor = rope_scaling_factor + self.sliding_window = sliding_window + self.layer_norm_epsilon = layer_norm_epsilon + self.dropout = dropout + self.token_embedding = token_embedding_layer + + def get_config(self): + config = super().get_config() + config.update( + { + "vocabulary_size": self.vocabulary_size, + "num_layers": self.num_layers, + "num_query_heads": self.num_query_heads, + "hidden_dim": self.hidden_dim, + "intermediate_dim": self.intermediate_dim, + "rope_max_wavelength": self.rope_max_wavelength, + "rope_scaling_factor": self.rope_scaling_factor, + "num_key_value_heads": self.num_key_value_heads, + "sliding_window": self.sliding_window, + "layer_norm_epsilon": self.layer_norm_epsilon, + "dropout": self.dropout, + } + ) + return config diff --git a/keras_nlp/models/mistral/mistral_backbone_test.py b/keras_nlp/models/mistral/mistral_backbone_test.py new file mode 100644 index 0000000000..fc2b0a592b --- /dev/null +++ b/keras_nlp/models/mistral/mistral_backbone_test.py @@ -0,0 +1,56 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest + +from keras_nlp.backend import ops +from keras_nlp.models.mistral.mistral_backbone import MistralBackbone +from keras_nlp.tests.test_case import TestCase + + +class MistralBackboneTest(TestCase): + def setUp(self): + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_query_heads": 8, + "num_key_value_heads": 4, + "hidden_dim": 16, + "intermediate_dim": 8, + "sliding_window": 2, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), + } + + def test_backbone_basics(self): + self.run_backbone_test( + cls=MistralBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 16), + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=MistralBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + + def test_num_parameters(self): + model = MistralBackbone(**self.init_kwargs) + # Reference value calculated using the PyTorch model + self.assertEqual(model.count_params(), 2704) diff --git a/keras_nlp/models/mistral/mistral_layer_norm.py b/keras_nlp/models/mistral/mistral_layer_norm.py new file mode 100644 index 0000000000..9f9ddf26b5 --- /dev/null +++ b/keras_nlp/models/mistral/mistral_layer_norm.py @@ -0,0 +1,48 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from keras_nlp.backend import keras +from keras_nlp.backend import ops + + +# TODO: Deprecate this in favor of +# `keras.layers.LayerNormalization(rms_scaling=True)` once Keras 2 support is +# removed. 
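The 2704 reference in `test_num_parameters` above can be re-derived from the hyperparameters. A hand-computed breakdown (illustrative; it assumes the test config: untied embeddings, no biases, and head_dim = 16 / 8 = 2):

```python
vocab, hidden, layers, intermediate = 10, 16, 2, 8
q_heads, kv_heads = 8, 4
head_dim = hidden // q_heads

embeddings = 2 * vocab * hidden  # untied input and output embeddings
attention = (
    hidden * q_heads * head_dim  # query projection
    + 2 * hidden * kv_heads * head_dim  # key and value projections
    + q_heads * head_dim * hidden  # output projection
)
ffn = 2 * hidden * intermediate + intermediate * hidden  # gate, up, down
norms = 2 * hidden  # two RMS norms per layer
total = embeddings + layers * (attention + ffn + norms) + hidden  # final norm
assert total == 2704
```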
+class MistralLayerNormalization(keras.layers.Layer):
+    """A normalization layer for Mistral that implements RMS normalization."""
+
+    def __init__(self, epsilon=1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self._epsilon = epsilon
+
+    def build(self, input_shape):
+        self._dim = input_shape[-1]
+        self._weight = self.add_weight(
+            name="weight",
+            trainable=True,
+            shape=(self._dim,),
+            initializer="ones",
+            dtype=self.compute_dtype,
+        )
+        self.built = True
+
+    def call(self, x):
+        x = x * ops.rsqrt(
+            ops.mean(ops.power(x, 2), axis=-1, keepdims=True) + self._epsilon
+        )
+        return x * self._weight
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({"epsilon": self._epsilon})
+        return config
diff --git a/keras_nlp/models/mistral/mistral_preprocessor.py b/keras_nlp/models/mistral/mistral_preprocessor.py
new file mode 100644
index 0000000000..e6d54c793a
--- /dev/null
+++ b/keras_nlp/models/mistral/mistral_preprocessor.py
@@ -0,0 +1,175 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from keras_nlp.backend import keras
+from keras_nlp.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_nlp.models.mistral.mistral_tokenizer import MistralTokenizer
+from keras_nlp.models.preprocessor import Preprocessor
+from keras_nlp.utils.keras_utils import (
+    convert_inputs_to_list_of_tensor_segments,
+)
+from keras_nlp.utils.keras_utils import pack_x_y_sample_weight
+from keras_nlp.utils.python_utils import classproperty
+
+
+@keras.saving.register_keras_serializable(package="keras_nlp")
+class MistralPreprocessor(Preprocessor):
+    """A Mistral preprocessing layer which tokenizes and packs inputs.
+
+    This preprocessing layer will do three things:
+
+    1. Tokenize any number of input segments using the `tokenizer`.
+    2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`
+       with the appropriate tokens.
+    3. Construct a dictionary with keys `"token_ids"` and `"padding_mask"`
+       that can be passed directly to `keras_nlp.models.MistralBackbone`.
+
+    This layer can be used directly with `tf.data.Dataset.map` to preprocess
+    string data in the `(x, y, sample_weight)` format used by
+    `keras.Model.fit`.
+
+    Args:
+        tokenizer: A `keras_nlp.models.MistralTokenizer` instance.
+        sequence_length: The length of the packed inputs.
+        add_start_token: If `True`, the preprocessor will prepend the
+            tokenizer start token to each input sequence. Defaults to `True`.
+        add_end_token: If `True`, the preprocessor will append the tokenizer
+            end token to each input sequence. Defaults to `False`.
+
+    Call arguments:
+        x: A tensor of single string sequences, or a tuple of multiple
+            tensor sequences to be packed together. Inputs may be batched or
+            unbatched. For single sequences, raw python inputs will be
+            converted to tensors. For multiple sequences, pass tensors
+            directly.
+        y: Any label data. Will be passed through unaltered.
+        sample_weight: Any label weight data. Will be passed through
+            unaltered.
+        sequence_length: Pass to override the configured `sequence_length` of
+            the layer.
+
+    Examples:
+
+    Directly calling the layer on data.
+    ```python
+    preprocessor = keras_nlp.models.MistralPreprocessor.from_preset(
+        "mistral_base_en"
+    )
+
+    # Tokenize and pack a single sentence.
+    preprocessor("The quick brown fox jumped.")
+
+    # Tokenize a batch of single sentences.
+    preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])
+
+    # Preprocess a batch of sentence pairs.
+    # When handling multiple sequences, always convert to tensors first!
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    preprocessor((first, second))
+    ```
+
+    Mapping with `tf.data.Dataset`.
+    ```python
+    preprocessor = keras_nlp.models.MistralPreprocessor.from_preset(
+        "mistral_base_en"
+    )
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    label = tf.constant([1, 1])
+
+    # Map labeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices((first, label))
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map unlabeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices(first)
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map labeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map unlabeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices((first, second))
+
+    # Watch out for tf.data's default unpacking of tuples here!
+    # Best to invoke the `preprocessor` directly in this case.
+    ds = ds.map(
+        lambda first, second: preprocessor(x=(first, second)),
+        num_parallel_calls=tf.data.AUTOTUNE,
+    )
+    ```
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        sequence_length=1024,
+        add_start_token=True,
+        add_end_token=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tokenizer = tokenizer
+        self.add_start_token = add_start_token
+        self.add_end_token = add_end_token
+        self.sequence_length = sequence_length
+        self.packer = StartEndPacker(
+            start_value=self.tokenizer.start_token_id,
+            end_value=self.tokenizer.end_token_id,
+            sequence_length=sequence_length,
+            return_padding_mask=True,
+        )
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "sequence_length": self.sequence_length,
+                "add_start_token": self.add_start_token,
+                "add_end_token": self.add_end_token,
+            }
+        )
+        return config
+
+    def call(
+        self,
+        x,
+        y=None,
+        sample_weight=None,
+        sequence_length=None,
+    ):
+        x = convert_inputs_to_list_of_tensor_segments(x)
+        if len(x) != 1:
+            raise ValueError(
+                "Mistral requires each input feature to contain only "
+                f"one segment, but received {len(x)}. If you are using Mistral"
+                " for a multi-segment classification task, please refer to "
+                "classification models like BERT or RoBERTa."
+ ) + sequence_length = sequence_length or self.sequence_length + token_ids, padding_mask = self.packer( + self.tokenizer(x[0]), + sequence_length=sequence_length, + add_start_value=self.add_start_token, + add_end_value=self.add_end_token, + ) + x = { + "token_ids": token_ids, + "padding_mask": padding_mask, + } + return pack_x_y_sample_weight(x, y, sample_weight) + + @classproperty + def tokenizer_cls(cls): + return MistralTokenizer diff --git a/keras_nlp/models/mistral/mistral_preprocessor_test.py b/keras_nlp/models/mistral/mistral_preprocessor_test.py new file mode 100644 index 0000000000..40528fd4e8 --- /dev/null +++ b/keras_nlp/models/mistral/mistral_preprocessor_test.py @@ -0,0 +1,59 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from keras_nlp.models.mistral.mistral_preprocessor import MistralPreprocessor +from keras_nlp.models.mistral.mistral_tokenizer import MistralTokenizer +from keras_nlp.tests.test_case import TestCase + + +class MistralPreprocessorTest(TestCase): + def setUp(self): + self.tokenizer = MistralTokenizer( + # Generated using create_mistral_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "mistral_test_vocab.spm" + ) + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["the quick brown fox"], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. + ) + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=MistralPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[1, 3, 8, 4, 6, 0, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 0, 0, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. + ), + ) + + def test_errors_for_2d_list_input(self): + preprocessor = MistralPreprocessor(**self.init_kwargs) + ambiguous_input = [["one", "two"], ["three", "four"]] + with self.assertRaises(ValueError): + preprocessor(ambiguous_input) diff --git a/keras_nlp/models/mistral/mistral_tokenizer.py b/keras_nlp/models/mistral/mistral_tokenizer.py new file mode 100644 index 0000000000..2031d907cc --- /dev/null +++ b/keras_nlp/models/mistral/mistral_tokenizer.py @@ -0,0 +1,79 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
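The expected output in `MistralPreprocessorTest` above follows mechanically from the packing rules (`add_start_token=True`, `add_end_token=False`). A pure-Python sketch of the same packing step, with the start and pad ids (1 and 0) taken from the test fixture:

```python
def pack(token_ids, sequence_length, start_id=1, pad_id=0):
    # Prepend the start token, truncate, then right-pad; the mask marks
    # real tokens with 1 and padding with 0.
    ids = [start_id] + list(token_ids)
    ids = ids[:sequence_length]
    mask = [1] * len(ids) + [0] * (sequence_length - len(ids))
    ids = ids + [pad_id] * (sequence_length - len(ids))
    return ids, mask

ids, mask = pack([3, 8, 4, 6], sequence_length=8)
assert ids == [1, 3, 8, 4, 6, 0, 0, 0]
assert mask == [1, 1, 1, 1, 1, 0, 0, 0]
```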
+from keras_nlp.backend import keras
+from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
+
+
+@keras.saving.register_keras_serializable(package="keras_nlp")
+class MistralTokenizer(SentencePieceTokenizer):
+    """Mistral tokenizer layer based on SentencePiece.
+
+    This tokenizer class will tokenize raw strings into integer sequences and
+    is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the
+    underlying tokenizer, it will check for all special tokens needed by
+    Mistral models and provides a `from_preset()` method to automatically
+    download a matching vocabulary for a Mistral preset.
+
+    This tokenizer does not provide truncation or padding of inputs. It can be
+    combined with a `keras_nlp.models.MistralPreprocessor` layer for input
+    packing.
+
+    If input is a batch of strings (rank > 0), the layer will output a
+    `tf.RaggedTensor` where the last dimension of the output is ragged.
+
+    If input is a scalar string (rank == 0), the layer will output a dense
+    `tf.Tensor` with static shape `[None]`.
+
+    Args:
+        proto: Either a `string` path to a SentencePiece proto file, or a
+            `bytes` object with a serialized SentencePiece proto. See the
+            [SentencePiece repository](https://github.com/google/sentencepiece)
+            for more details on the format.
+
+    Examples:
+    ```python
+    # Unbatched input.
+    tokenizer = keras_nlp.models.MistralTokenizer.from_preset(
+        "mistral_base_en",
+    )
+    tokenizer("The quick brown fox jumped.")
+
+    # Batched input.
+    tokenizer(["The quick brown fox jumped.", "The fox slept."])
+
+    # Detokenization.
+    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
+    ```
+    """
+
+    def __init__(self, proto, **kwargs):
+        self.start_token = "<s>"
+        self.end_token = "</s>"
+        super().__init__(proto=proto, **kwargs)
+
+    def set_proto(self, proto):
+        super().set_proto(proto)
+        if proto is not None:
+            for token in [self.start_token, self.end_token]:
+                if token not in self.get_vocabulary():
+                    raise ValueError(
+                        f"Cannot find token `'{token}'` in the provided "
+                        f"`vocabulary`. Please provide `'{token}'` in your "
+                        "`vocabulary` or use a pretrained `vocabulary` name."
+                    )
+            self.start_token_id = self.token_to_id(self.start_token)
+            self.end_token_id = self.token_to_id(self.end_token)
+        else:
+            self.start_token_id = None
+            self.end_token_id = None
diff --git a/keras_nlp/models/mistral/mistral_tokenizer_test.py b/keras_nlp/models/mistral/mistral_tokenizer_test.py
new file mode 100644
index 0000000000..ea9e04f67d
--- /dev/null
+++ b/keras_nlp/models/mistral/mistral_tokenizer_test.py
@@ -0,0 +1,46 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
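Because `set_proto` validates eagerly, a vocabulary missing the `<s>`/`</s>` tokens fails at construction time rather than during tokenization. A usage sketch (the `.spm` paths here are placeholders, not real assets):

```python
from keras_nlp.models.mistral.mistral_tokenizer import MistralTokenizer

# Hypothetical proto containing <s> and </s>:
tokenizer = MistralTokenizer(proto="mistral_vocab.spm")
print(tokenizer.start_token_id, tokenizer.end_token_id)

# A proto without the special tokens raises immediately:
try:
    MistralTokenizer(proto="plain_vocab.spm")  # hypothetical, no <s>/</s>
except ValueError as err:
    print(err)
```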
+ +import os + +from keras_nlp.models.mistral.mistral_tokenizer import MistralTokenizer +from keras_nlp.tests.test_case import TestCase + + +class MistralTokenizerTest(TestCase): + def setUp(self): + self.init_kwargs = { + # Generated using create_mistral_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "mistral_test_vocab.spm" + ) + } + self.input_data = ["the quick brown fox", "the earth is round"] + + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=MistralTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[3, 8, 4, 6], [3, 5, 7, 9]], + ) + + def test_errors_missing_special_tokens(self): + with self.assertRaises(ValueError): + MistralTokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) diff --git a/keras_nlp/models/mistral/mistral_transformer_decoder.py b/keras_nlp/models/mistral/mistral_transformer_decoder.py new file mode 100644 index 0000000000..9b6f7fdbf8 --- /dev/null +++ b/keras_nlp/models/mistral/mistral_transformer_decoder.py @@ -0,0 +1,233 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from keras_nlp.backend import keras +from keras_nlp.backend import ops +from keras_nlp.layers.modeling.transformer_layer_utils import ( + compute_causal_mask, +) +from keras_nlp.layers.modeling.transformer_layer_utils import ( + merge_padding_and_attention_mask, +) +from keras_nlp.models.mistral.mistral_attention import CachedMistralAttention +from keras_nlp.models.mistral.mistral_layer_norm import ( + MistralLayerNormalization, +) +from keras_nlp.utils.keras_utils import clone_initializer + + +class MistralTransformerDecoder(keras.layers.Layer): + """A Transformer decoder layer for the Mistral backbone.""" + + def __init__( + self, + intermediate_dim, + num_query_heads, + num_key_value_heads, + rope_max_wavelength=10000, + rope_scaling_factor=1.0, + activation="relu", + layer_norm_epsilon=1e-5, + kernel_initializer="glorot_uniform", + sliding_window=512, + dropout=0, + **kwargs, + ): + super().__init__(**kwargs) + self.intermediate_dim = intermediate_dim + self.num_query_heads = num_query_heads + self.num_key_value_heads = num_key_value_heads + + self.rope_max_wavelength = rope_max_wavelength + self.rope_scaling_factor = rope_scaling_factor + + self.dropout = dropout + + self.sliding_window = sliding_window + self.activation = keras.activations.get(activation) + self.layer_norm_epsilon = layer_norm_epsilon + self.kernel_initializer = keras.initializers.get(kernel_initializer) + + self.supports_masking = True + + def build(self, decoder_sequence_shape): + self._decoder_sequence_shape = decoder_sequence_shape + self.hidden_dim = decoder_sequence_shape[-1] + + # Self attention layer. 
+ self._self_attention_layer = CachedMistralAttention( + num_query_heads=self.num_query_heads, + num_key_value_heads=self.num_key_value_heads, + rope_max_wavelength=self.rope_max_wavelength, + rope_scaling_factor=self.rope_scaling_factor, + sliding_window=self.sliding_window, + kernel_initializer=clone_initializer(self.kernel_initializer), + dropout=self.dropout, + dtype=self.compute_dtype, + name="self_attention", + ) + self._self_attention_layer.build(decoder_sequence_shape) + + self._self_attention_layernorm = MistralLayerNormalization( + epsilon=self.layer_norm_epsilon, + name="self_attention_layernorm", + dtype=self.compute_dtype, + ) + self._self_attention_layernorm.build(decoder_sequence_shape) + self._self_attention_dropout = keras.layers.Dropout( + rate=self.dropout, + dtype=self.compute_dtype, + name="self_attention_dropout", + ) + + # Feedforward layers. + self._feedforward_intermediate_dense = keras.layers.Dense( + self.intermediate_dim, + kernel_initializer=clone_initializer(self.kernel_initializer), + use_bias=False, + dtype=self.compute_dtype, + name="feedforward_intermediate_dense", + ) + self._feedforward_intermediate_dense.build(decoder_sequence_shape) + + self._feedforward_gate_dense = keras.layers.Dense( + self.intermediate_dim, + activation=self.activation, + kernel_initializer=clone_initializer(self.kernel_initializer), + use_bias=False, + name="feedforward_gate_dense", + ) + self._feedforward_gate_dense.build(decoder_sequence_shape) + + self._feedforward_output_dense = keras.layers.Dense( + self.hidden_dim, + kernel_initializer=clone_initializer(self.kernel_initializer), + use_bias=False, + dtype=self.compute_dtype, + name="feedforward_output_dense", + ) + + self._feedforward_output_dense.build( + self._feedforward_gate_dense.compute_output_shape( + decoder_sequence_shape + ) + ) + + self._feedforward_layernorm = MistralLayerNormalization( + epsilon=self.layer_norm_epsilon, + name="feedforward_layernorm", + dtype=self.compute_dtype, + ) + self._feedforward_layernorm.build(decoder_sequence_shape) + + self.built = True + + def call( + self, + decoder_sequence, + decoder_padding_mask=None, + decoder_attention_mask=None, + self_attention_cache=None, + self_attention_cache_update_index=None, + training=None, + ): + self_attention_mask = self._compute_self_attention_mask( + decoder_sequence=decoder_sequence, + decoder_padding_mask=decoder_padding_mask, + decoder_attention_mask=decoder_attention_mask, + ) + residual = decoder_sequence + + x = self._self_attention_layernorm(decoder_sequence) + + # Self attention block. 
+ x = self._self_attention_layer( + hidden_states=x, + attention_mask=self_attention_mask, + cache=self_attention_cache, + cache_update_index=self_attention_cache_update_index, + ) + + if self_attention_cache is not None: + x, self_attention_cache = x + + x = self._self_attention_dropout(x, training=training) + + x = x + residual + residual = x + + x = self._feedforward_layernorm(x) + gate_output = self._feedforward_gate_dense(x) + + x = self._feedforward_intermediate_dense(x) + + x = self._feedforward_output_dense(ops.multiply(x, gate_output)) + + decoder_output = x + residual + + if self_attention_cache is not None: + return decoder_output, self_attention_cache + return decoder_output + + def _compute_self_attention_mask( + self, + decoder_sequence, + decoder_padding_mask, + decoder_attention_mask, + ): + decoder_mask = merge_padding_and_attention_mask( + decoder_sequence, decoder_padding_mask, decoder_attention_mask + ) + batch_size = ops.shape(decoder_sequence)[0] + input_length = output_length = ops.shape(decoder_sequence)[1] + + # Mistral uses a banded attention mask + causal_mask_lower = compute_causal_mask( + batch_size, input_length, output_length, 0 + ) + # Below is a workaround for `ops.triu` for Keras 2. + # TODO(tirthasheshpatel): Use `ops.triu` once Keras 2 support is removed. + # causal_mask = ops.triu(causal_mask_lower, k=-self.sliding_window) + i = ops.arange(output_length)[:, None] + j = ops.arange(input_length)[None, :] + causal_mask_upper = ops.cast(i <= j + self.sliding_window, "int32") + causal_mask = ops.minimum(causal_mask_lower, causal_mask_upper) + + return ( + ops.minimum(decoder_mask, causal_mask) + if decoder_mask is not None + else causal_mask + ) + + def compute_output_shape(self, decoder_sequence_shape): + return decoder_sequence_shape + + def get_config(self): + config = super().get_config() + config.update( + { + "intermediate_dim": self.intermediate_dim, + "num_query_heads": self.num_query_heads, + "rope_max_wavelength": self.rope_max_wavelength, + "rope_scaling_factor": self.rope_scaling_factor, + "num_key_value_heads": self.num_key_value_heads, + "sliding_window": self.sliding_window, + "activation": keras.activations.serialize(self.activation), + "layer_norm_epsilon": self.layer_norm_epsilon, + "kernel_initializer": keras.initializers.serialize( + self.kernel_initializer + ), + "dropout": self.dropout, + } + ) + return config diff --git a/keras_nlp/models/opt/opt_backbone.py b/keras_nlp/models/opt/opt_backbone.py index 8fe37472a0..ff1495ba9f 100644 --- a/keras_nlp/models/opt/opt_backbone.py +++ b/keras_nlp/models/opt/opt_backbone.py @@ -14,10 +14,6 @@ import copy -from tensorflow.experimental import dtensor -from tensorflow.experimental.dtensor import Layout -from tensorflow.keras.dtensor.experimental import LayoutMap - from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.layers.modeling.token_and_position_embedding import ( @@ -168,71 +164,3 @@ def get_config(self): @classproperty def presets(cls): return copy.deepcopy(backbone_presets) - - @classmethod - def create_layout_map(cls, mesh): - """Create a DTensor layout map for an OPTBackbone. - - Given a DTensor mesh describing a list of devices, this method returns a - DTensor layout map for creating a `keras_nlp.models.OPTBackbone` - instance. This mapping describes how to distribute all model weights - across multiple devices. For an overview of DTensor concepts, see - [this guide](https://www.tensorflow.org/guide/dtensor_overview). 
- - Args: - mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement - of devices for running distributed computation. The - first dimension in the mesh is expected to be for data parallel - distribution, and the second for model parallel distribution. - - Returns: - A `tf.keras.dtensor.experimental.LayoutMap` which contains the - proper layout to weights mapping for the model parallel setting. - - Examples: - ```python - keras.backend.experimental.enable_tf_random_generator() - keras.utils.set_random_seed(1337) - - # Update both dimensions below for a multi-device setting. - mesh = dtensor.create_mesh([("batch", 1), ("model", 1)]) - layout_map = keras_nlp.models.OPTBackbone.create_layout_map(mesh) - - with layout_map.scope(): - model = keras_nlp.models.OPTBackbone.from_preset("opt_125m_en") - ``` - """ - # We assert the mesh is 2D, and assume the first mesh dim is for data - # parallel and the second dim is for model parallel. - mesh_shape = mesh.shape() - if len(mesh_shape) != 2: - raise ValueError( - f"Expect to create layout based on 2D mesh, received {mesh}" - ) - _, model_dim = mesh.dim_names - unshard_dim = dtensor.UNSHARDED - - layout_map = LayoutMap(mesh=mesh) - # Embedding sharding - layout_map[r".*embeddings"] = Layout([unshard_dim, model_dim], mesh) - - # Transformer block sharding - layout_map[r".*_(query|key|value)_dense.kernel"] = Layout( - [unshard_dim, unshard_dim, model_dim], mesh - ) - layout_map[r".*_(query|key|value)_dense.bias"] = Layout( - [model_dim, unshard_dim], mesh - ) - layout_map[r".*_feedforward_intermediate_dense.kernel"] = Layout( - [unshard_dim, model_dim], mesh - ) - layout_map[r".*_feedforward_intermediate_dense.bias"] = Layout( - [model_dim], mesh - ) - layout_map[r".*_feedforward_output_dense.kernel"] = Layout( - [model_dim, unshard_dim], mesh - ) - layout_map[r".*_feedforward_output_dense.bias"] = Layout( - [unshard_dim], mesh - ) - return layout_map diff --git a/keras_nlp/models/opt/opt_backbone_test.py b/keras_nlp/models/opt/opt_backbone_test.py index 012c99c8a6..445bdaebad 100644 --- a/keras_nlp/models/opt/opt_backbone_test.py +++ b/keras_nlp/models/opt/opt_backbone_test.py @@ -12,118 +12,65 @@ # See the License for the specific language governing permissions and # limitations under the License. 
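One more note on the Mistral decoder before the OPT cleanups below: the `ops.triu` workaround in `MistralTransformerDecoder._compute_self_attention_mask` builds a banded causal mask. The same construction in NumPy (a standalone, illustrative sketch):

```python
import numpy as np

length, sliding_window = 6, 3
i = np.arange(length)[:, None]  # query (output) positions
j = np.arange(length)[None, :]  # key (input) positions
causal_lower = (i >= j).astype("int32")  # ordinary causal mask
causal_upper = (i <= j + sliding_window).astype("int32")  # band limit
banded = np.minimum(causal_lower, causal_upper)
print(banded)
# Row i attends to keys j with i - sliding_window <= j <= i, so older
# positions fall out of view, mirroring the rolling KV cache.
```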
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.opt.opt_backbone import OPTBackbone from keras_nlp.tests.test_case import TestCase class OPTBackboneTest(TestCase): def setUp(self): - self.backbone = OPTBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_opt(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "opt_backbone") - - def test_variable_sequence_length_call_opt(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=OPTBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) - @pytest.mark.large # Saving is slow, so mark these large. + @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, OPTBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) - - def test_create_layout_map(self): - mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)]) - with OPTBackbone.create_layout_map(mesh).scope(): - OPTBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - # Using DTensor enables the mlir bridge as a side effect. Eventually - # this will be default, but for now we have compile errors with the - # bridge elsewhere and must disable. 
See - # https://github.com/keras-team/keras-nlp/issues/1001 - tf.config.experimental.disable_mlir_bridge() + self.run_model_saving_test( + cls=OPTBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=OPTBackbone, + preset="opt_125m_en", + input_data={ + "token_ids": ops.array([[133, 2119, 6219, 23602, 4]]), + "padding_mask": ops.ones((1, 5), dtype="int32"), + }, + expected_output_shape=(1, 5, 768), + # The forward pass from a preset should be stable! + expected_partial_output=ops.array( + [-0.246, -1.004, -0.072, 0.097, 0.533] + ), + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class OPTBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = OPTBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=32, - intermediate_dim=128, - max_sequence_length=128, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in OPTBackbone.presets: + self.run_preset_test( + cls=OPTBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/opt/opt_causal_lm.py b/keras_nlp/models/opt/opt_causal_lm.py index f0b0682749..6197a87ffd 100644 --- a/keras_nlp/models/opt/opt_causal_lm.py +++ b/keras_nlp/models/opt/opt_causal_lm.py @@ -321,39 +321,3 @@ def next(prompt, cache, index): "token_ids": token_ids, "padding_mask": padding_mask, } - - @classmethod - def create_layout_map(cls, mesh): - """Create a DTensor layout map for an OPTCausalLM. - - Given a DTensor mesh describing a list of devices, this method returns a - DTensor layout map for creating a `keras_nlp.models.OPTCausalLM` - instance. This mapping describes how to distribute all model weights - across multiple devices. For an overview of DTensor concepts, see - [this guide](https://www.tensorflow.org/guide/dtensor_overview). - - Args: - mesh: A 2D `tf.experimental.dtensor.Mesh` describing the arrangement - of devices for running distributed computation. The - first dimension in the mesh is expected to be for data parallel - distribution, and the second for model parallel distribution. - - Returns: - A `tf.keras.dtensor.experimental.LayoutMap` which contains the - proper layout to weights mapping for the model parallel setting. - - Examples: - ```python - keras.backend.experimental.enable_tf_random_generator() - keras.utils.set_random_seed(1337) - - # Update both dimensions below for a multi-device setting. - mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)]) - layout_map = keras_nlp.models.OPTCausalLM.create_layout_map(mesh) - - with layout_map.scope(): - opt_lm = keras_nlp.models.OPTCausalLM.from_preset("opt_125m_en") - ``` - """ - # As this task has no new variables, we just re-use the backbone method. 
-        return cls.backbone_cls.create_layout_map(mesh)
diff --git a/keras_nlp/models/opt/opt_causal_lm_preprocessor.py b/keras_nlp/models/opt/opt_causal_lm_preprocessor.py
index 26f01a32d1..1895854e41 100644
--- a/keras_nlp/models/opt/opt_causal_lm_preprocessor.py
+++ b/keras_nlp/models/opt/opt_causal_lm_preprocessor.py
@@ -143,6 +143,9 @@ def generate_preprocess(
         the sequence (as generation is expected to continue at the end of
         the inputted prompt).
         """
+        if not self.built:
+            self.build(None)
+
         x = convert_inputs_to_list_of_tensor_segments(x)[0]
         x = self.tokenizer(x)
         token_ids, padding_mask = self.packer(
@@ -163,11 +166,12 @@ def generate_postprocess(
         padding and start/end tokens, and then converting the integer sequence
         back to a string.
         """
+        if not self.built:
+            self.build(None)
+
         token_ids, padding_mask = x["token_ids"], x["padding_mask"]
-        if not isinstance(token_ids, tf.Tensor):
-            token_ids = ops.convert_to_numpy(token_ids)
-        if not isinstance(padding_mask, tf.Tensor):
-            padding_mask = ops.convert_to_numpy(padding_mask)
+        token_ids = ops.convert_to_numpy(token_ids)
+        padding_mask = ops.convert_to_numpy(padding_mask)
         # Strip any special tokens during detokenization (e.g. the start and
         # end markers). In the future we could make this configurable.
         padding_mask = padding_mask & (token_ids != self.tokenizer.end_token_id)
diff --git a/keras_nlp/models/opt/opt_causal_lm_preprocessor_test.py b/keras_nlp/models/opt/opt_causal_lm_preprocessor_test.py
index eb54a94196..9ba6851d4b 100644
--- a/keras_nlp/models/opt/opt_causal_lm_preprocessor_test.py
+++ b/keras_nlp/models/opt/opt_causal_lm_preprocessor_test.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tensorflow as tf
+import pytest
 
-from keras_nlp.backend import keras
 from keras_nlp.models.opt.opt_causal_lm_preprocessor import (
     OPTCausalLMPreprocessor,
 )
@@ -24,104 +23,71 @@ class OPTCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = {
-            "<pad>": 0,
-            "</s>": 1,
-            "air": 2,
-            "Ġair": 3,
-            "plane": 4,
-            "Ġat": 5,
-            "port": 6,
+        self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"]
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
+        self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
+        self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
+        self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.tokenizer = OPTTokenizer(
+            vocabulary=self.vocab,
+            merges=self.merges,
+        )
+        self.init_kwargs = {
+            "tokenizer": self.tokenizer,
+            "sequence_length": 8,
         }
-
-        merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"]
-        merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"]
-        merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"]
-        merges += ["pla ne"]
-        self.merges = merges
-
-        self.preprocessor = OPTCausalLMPreprocessor(
-            tokenizer=OPTTokenizer(
-                vocabulary=self.vocab,
-                merges=self.merges,
+        self.input_data = ["airplane at airport"]
+
+    def test_preprocessor_basics(self):
+        self.run_preprocessing_layer_test(
+            cls=OPTCausalLMPreprocessor,
+            init_kwargs=self.init_kwargs,
+            input_data=self.input_data,
+            expected_output=(
+                {
+                    "token_ids": [[1, 2, 4, 5, 3, 6, 1, 0]],
+                    "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
+                },
+                [[2, 4, 5, 3, 6, 1, 0, 0]],  # Pass through labels.
+                [[1, 1, 1, 1, 1, 1, 0, 0]],  # Pass through sample_weights.
), - sequence_length=8, ) - def test_strings(self): - input_data = " airplane at airport" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [1, 3, 4, 5, 3, 6, 1, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - self.assertAllEqual(y, [3, 4, 5, 3, 6, 1, 0, 0]) - self.assertAllEqual(sw, [1, 1, 1, 1, 1, 1, 0, 0]) - - def test_list_of_strings(self): - input_data = [" airplane at airport"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 5, 3, 6, 1, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[3, 4, 5, 3, 6, 1, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - def test_no_start_end_token(self): - input_data = [" airplane at airport"] * 4 + input_data = ["airplane at airport"] * 4 preprocessor = OPTCausalLMPreprocessor( - tokenizer=OPTTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, + **self.init_kwargs, add_start_token=False, add_end_token=False, ) x, y, sw = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[3, 4, 5, 3, 6, 0, 0, 0]] * 4) + self.assertAllEqual(x["token_ids"], [[2, 4, 5, 3, 6, 0, 0, 0]] * 4) self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) self.assertAllEqual(y, [[4, 5, 3, 6, 0, 0, 0, 0]] * 4) self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4) - def test_labeled_batch(self): - x = tf.constant([" airplane at airport"] * 4) - y = tf.constant([1] * 4) # Ignored. - sw = tf.constant([1.0] * 4) # Ignored. - x, y, sw = self.preprocessor(x, y, sw) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 5, 3, 6, 1, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[3, 4, 5, 3, 6, 1, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - - def test_dataset(self): - x = tf.constant([" airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 5, 3, 6, 1, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, [[3, 4, 5, 3, 6, 1, 0, 0]] * 4) - self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4) - def test_generate_preprocess(self): - input_data = " airplane at airport" - x = self.preprocessor.generate_preprocess(input_data) - self.assertAllEqual(x["token_ids"], [1, 3, 4, 5, 3, 6, 0, 0]) + input_data = "airplane at airport" + preprocessor = OPTCausalLMPreprocessor(**self.init_kwargs) + x = preprocessor.generate_preprocess(input_data) + self.assertAllEqual(x["token_ids"], [1, 2, 4, 5, 3, 6, 0, 0]) self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0]) def test_generate_postprocess(self): input_data = { - "token_ids": tf.constant([1, 3, 4, 5, 3, 6, 0, 0]), - "padding_mask": tf.cast([1, 1, 1, 1, 1, 1, 0, 0], dtype="bool"), + "token_ids": [1, 2, 4, 5, 3, 6, 0, 0], + "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0], } - x = self.preprocessor.generate_postprocess(input_data) - self.assertAllEqual(x, " airplane at airport") - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + preprocessor = OPTCausalLMPreprocessor(**self.init_kwargs) + x = 
preprocessor.generate_postprocess(input_data) + self.assertAllEqual(x, "airplane at airport") + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in OPTCausalLMPreprocessor.presets: + self.run_preset_test( + cls=OPTCausalLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/opt/opt_causal_lm_test.py b/keras_nlp/models/opt/opt_causal_lm_test.py index 19325c7b0a..3ba27178d1 100644 --- a/keras_nlp/models/opt/opt_causal_lm_test.py +++ b/keras_nlp/models/opt/opt_causal_lm_test.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from unittest.mock import patch import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.opt.opt_backbone import OPTBackbone from keras_nlp.models.opt.opt_causal_lm import OPTCausalLM @@ -31,24 +28,11 @@ class OPTCausalLMTest(TestCase): def setUp(self): - self.vocab = { - "<pad>": 0, - "</s>": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "Ġkoh": 6, - "li": 7, - "Ġis": 8, - "Ġthe": 9, - "Ġbest": 10, - } - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges + self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] self.preprocessor = OPTCausalLMPreprocessor( OPTTokenizer(vocabulary=self.vocab, merges=self.merges), sequence_length=8, @@ -59,68 +43,46 @@ def setUp(self): num_heads=2, hidden_dim=4, intermediate_dim=8, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.causal_lm = OPTCausalLM( - backbone=self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.raw_batch = [ - " airplane at airport", - " airplane at airport", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_causal_lm(self): - self.causal_lm(self.preprocessed_batch) - - def test_predict(self): - self.causal_lm.predict(self.raw_batch) - self.causal_lm.preprocessor = None - self.causal_lm.predict(self.preprocessed_batch) - - def test_fit(self): - self.causal_lm.fit(self.raw_dataset) - self.causal_lm.preprocessor = None - self.causal_lm.fit(self.preprocessed_dataset) - - def test_fit_no_xla(self): - self.causal_lm.preprocessor = None - self.causal_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ([" airplane at airport", " airplane at airport"],) + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_causal_lm_basics(self): + self.run_task_test( + cls=OPTCausalLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 8, 7), ) - self.causal_lm.fit(self.preprocessed_dataset) def test_generate(self): + causal_lm =
OPTCausalLM(**self.init_kwargs) # String input. prompt = " airplane at airport" - output = self.causal_lm.generate(" airplane at airport") + output = causal_lm.generate(" airplane at airport") self.assertTrue(prompt in output) - # String tensor input. - self.assertIsInstance(self.causal_lm.generate(self.raw_batch)[0], str) - # String dataset input. - self.assertIsInstance(self.causal_lm.generate(self.raw_dataset)[0], str) # Int tensor input. - self.causal_lm.preprocessor = None - outputs = self.causal_lm.generate(self.preprocessed_batch) + prompt_ids = self.preprocessor.generate_preprocess([prompt]) + causal_lm.preprocessor = None + outputs = causal_lm.generate(prompt_ids) # Assert prompt is in output in token id space. self.assertAllEqual( outputs["token_ids"][:, :5], - self.preprocessed_batch["token_ids"][:, :5], + prompt_ids["token_ids"][:, :5], ) self.assertAllEqual( outputs["padding_mask"][:, :5], - self.preprocessed_batch["padding_mask"][:, :5], + prompt_ids["padding_mask"][:, :5], ) def test_early_stopping(self): - call_with_cache = self.causal_lm.call_with_cache + causal_lm = OPTCausalLM(**self.init_kwargs) + call_with_cache = causal_lm.call_with_cache def wrapper(*args, **kwargs): """Modify output logits to always favor end_token_id""" @@ -131,53 +93,37 @@ def wrapper(*args, **kwargs): logits = ops.slice_update(logits, (0, 0, index), update) return logits, hidden_states, cache - with patch.object(self.causal_lm, "call_with_cache", wraps=wrapper): + with patch.object(causal_lm, "call_with_cache", wraps=wrapper): prompt = [" airplane at airport", " airplane"] - output = self.causal_lm.generate(prompt) + output = causal_lm.generate(prompt) # We should immediately abort and output the prompt. self.assertEqual(prompt, output) def test_generate_compilation(self): + causal_lm = OPTCausalLM(**self.init_kwargs) # Assert we do not recompile with successive calls. - self.causal_lm.generate(self.raw_batch) - first_fn = self.causal_lm.generate_function - self.causal_lm.generate(self.raw_batch) - second_fn = self.causal_lm.generate_function + causal_lm.generate(" airplane at airport") + first_fn = causal_lm.generate_function + causal_lm.generate(" airplane at airport") + second_fn = causal_lm.generate_function self.assertEqual(first_fn, second_fn) # Assert we do recompile after compile is called. - self.causal_lm.compile(sampler="greedy") - self.assertIsNone(self.causal_lm.generate_function) - - def test_serialization(self): - new_causal_lm = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.causal_lm) - ) - self.assertEqual( - new_causal_lm.get_config(), self.causal_lm.get_config() - ) + causal_lm.compile(sampler="greedy") + self.assertIsNone(causal_lm.generate_function) @pytest.mark.large def test_saved_model(self): - keras.utils.set_random_seed(42) - model_output = self.causal_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.causal_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, OPTCausalLM) - - # Check that output matches. 
- keras.utils.set_random_seed(42) - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=OPTCausalLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - def test_create_layout_map(self): - mesh = tf.experimental.dtensor.create_mesh([("batch", 1), ("model", 1)]) - with OPTCausalLM.create_layout_map(mesh).scope(): - OPTCausalLM(backbone=self.backbone) - # Using DTensor enables the mlir bridge as a side effect. Eventually - # this will be default, but for now we have compile errors with the - # bridge elsewhere and must disable. See - # https://github.com/keras-team/keras-nlp/issues/1001 - tf.config.experimental.disable_mlir_bridge() + @pytest.mark.extra_large + def test_all_presets(self): + for preset in OPTCausalLM.presets: + self.run_preset_test( + cls=OPTCausalLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/opt/opt_preprocessor.py b/keras_nlp/models/opt/opt_preprocessor.py index 6a6b5537bf..cdca904870 100644 --- a/keras_nlp/models/opt/opt_preprocessor.py +++ b/keras_nlp/models/opt/opt_preprocessor.py @@ -123,13 +123,19 @@ def __init__( self.sequence_length = sequence_length self.add_start_token = add_start_token self.add_end_token = add_end_token + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.packer = StartEndPacker( - start_value=tokenizer.start_token_id, - end_value=tokenizer.end_token_id, - pad_value=tokenizer.pad_token_id, - sequence_length=sequence_length, + start_value=self.tokenizer.start_token_id, + end_value=self.tokenizer.end_token_id, + pad_value=self.tokenizer.pad_token_id, + sequence_length=self.sequence_length, return_padding_mask=True, ) + self.built = True def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/opt/opt_preprocessor_test.py b/keras_nlp/models/opt/opt_preprocessor_test.py index ae68cf8089..b80c409b92 100644 --- a/keras_nlp/models/opt/opt_preprocessor_test.py +++ b/keras_nlp/models/opt/opt_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
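As context for the `build()` change in `opt_preprocessor.py` above: a minimal sketch of what the deferred `StartEndPacker` does once constructed. The layer and argument names match `keras_nlp.layers.StartEndPacker` as used in the diff; the toy token ids (1 for start/end, 0 for pad) are illustrative stand-ins for real tokenizer output.

```python
import tensorflow as tf

from keras_nlp.layers import StartEndPacker

# Created lazily in `build()`, after tokenizer assets exist, so the
# start/end/pad ids below can be read off the tokenizer safely.
packer = StartEndPacker(
    start_value=1,  # e.g. tokenizer.start_token_id
    end_value=1,  # e.g. tokenizer.end_token_id
    pad_value=0,  # e.g. tokenizer.pad_token_id
    sequence_length=8,
    return_padding_mask=True,
)
token_ids, padding_mask = packer(tf.ragged.constant([[2, 4, 5, 3, 6]]))
# token_ids    -> [[1, 2, 4, 5, 3, 6, 1, 0]]
# padding_mask -> [[1, 1, 1, 1, 1, 1, 1, 0]] (boolean)
```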
-import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.opt.opt_preprocessor import OPTPreprocessor from keras_nlp.models.opt.opt_tokenizer import OPTTokenizer from keras_nlp.tests.test_case import TestCase @@ -22,46 +21,34 @@ class OPTPreprocessorTest(TestCase): def setUp(self): - self.vocab = { - "<pad>": 0, - "</s>": 1, - "air": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, + self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.tokenizer = OPTTokenizer( + vocabulary=self.vocab, + merges=self.merges, + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges - - self.preprocessor = OPTPreprocessor( - tokenizer=OPTTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, + self.input_data = ["airplane at airport"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=OPTPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output={ + "token_ids": [[1, 2, 4, 5, 3, 6, 1, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, ) - def test_tokenize_strings(self): - input_data = " airplane at airport" - - x = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [1, 3, 4, 5, 3, 6, 1, 0]) - self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0]) - - def test_tokenize_list_of_strings(self): - input_data = [" airplane at airport"] * 4 - - x = self.preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 5, 3, 6, 1, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - def test_no_start_end_token(self): - input_data = [" airplane at airport"] * 4 + input_data = ["airplane at airport"] * 4 preprocessor = OPTPreprocessor( tokenizer=OPTTokenizer( @@ -73,36 +60,20 @@ def test_no_start_end_token(self): add_end_token=False, ) x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[3, 4, 5, 3, 6, 0, 0, 0]] * 4) + self.assertAllEqual(x["token_ids"], [[2, 4, 5, 3, 6, 0, 0, 0]] * 4) self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - def test_tokenize_labeled_batch(self): - x = tf.constant([" airplane at airport"] * 4) - y_in = tf.constant([1] * 4) - sw_in = tf.constant([1.0] * 4) - x, y, sw = self.preprocessor(x, y_in, sw_in) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 5, 3, 6, 1, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - self.assertAllEqual(y, y_in) - self.assertAllEqual(sw, sw_in) - - def test_tokenize_labeled_dataset(self): - x = tf.constant([" airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x = ds.batch(4).take(1).get_single_element() - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 5, 3, 6, 1, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - def test_sequence_length_override(self): - input_data = " airplane at airport" - x = self.preprocessor(input_data, sequence_length=4) -
self.assertAllEqual(x["token_ids"], [1, 3, 4, 1]) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + input_data = "airplane at airport" + preprocessor = OPTPreprocessor(**self.init_kwargs) + x = preprocessor(input_data, sequence_length=4) + self.assertAllEqual(x["token_ids"], [1, 2, 4, 1]) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in OPTPreprocessor.presets: + self.run_preset_test( + cls=OPTPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/opt/opt_presets.py b/keras_nlp/models/opt/opt_presets.py index 7af2641138..50091be243 100644 --- a/keras_nlp/models/opt/opt_presets.py +++ b/keras_nlp/models/opt/opt_presets.py @@ -26,22 +26,7 @@ "path": "opt", "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md", }, - "config": { - "vocabulary_size": 50272, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 2048, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/opt_125m_en/v1/model.h5", - "weights_hash": "63e444998982e48da4a1a3970f4c6203", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/opt_125m_en/v1/vocab.json", - "vocabulary_hash": "cf410ee085c5c69c957bb1f6d8456596", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/opt_125m_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/opt/keras/opt_125m_en/2", }, # We skip the 350m checkpoint because it does not match the structure of # other checkpoints. 
@@ -56,22 +41,7 @@ "path": "opt", "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md", }, - "config": { - "vocabulary_size": 50272, - "num_layers": 24, - "num_heads": 32, - "hidden_dim": 2048, - "intermediate_dim": 8192, - "dropout": 0.1, - "max_sequence_length": 2048, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/opt_1.3b_en/v1/model.h5", - "weights_hash": "0365ac8483e99a912c9770521909ecce", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/opt_1.3b_en/v1/vocab.json", - "vocabulary_hash": "cf410ee085c5c69c957bb1f6d8456596", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/opt_1.3b_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/opt/keras/opt_1.3b_en/2", }, "opt_2.7b_en": { "metadata": { @@ -84,22 +54,7 @@ "path": "opt", "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md", }, - "config": { - "vocabulary_size": 50272, - "num_layers": 32, - "num_heads": 32, - "hidden_dim": 2560, - "intermediate_dim": 10240, - "dropout": 0.1, - "max_sequence_length": 2048, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/opt_2.7b_en/v1/model.h5", - "weights_hash": "af56da9206a95b9287356955c5bc14e7", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/opt_2.7b_en/v1/vocab.json", - "vocabulary_hash": "cf410ee085c5c69c957bb1f6d8456596", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/opt_2.7b_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/opt/keras/opt_2.7b_en/2", }, "opt_6.7b_en": { "metadata": { @@ -112,21 +67,6 @@ "path": "opt", "model_card": "https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/model_card.md", }, - "config": { - "vocabulary_size": 50272, - "num_layers": 32, - "num_heads": 32, - "hidden_dim": 4096, - "intermediate_dim": 16384, - "dropout": 0.1, - "max_sequence_length": 2048, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/opt_6.7b_en/v1/model.h5", - "weights_hash": "543120fbe601b70e6ec04cc909781e21", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/opt_6.7b_en/v1/vocab.json", - "vocabulary_hash": "cf410ee085c5c69c957bb1f6d8456596", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/opt_6.7b_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/opt/keras/opt_6.7b_en/2", }, } diff --git a/keras_nlp/models/opt/opt_presets_test.py b/keras_nlp/models/opt/opt_presets_test.py deleted file mode 100644 index a9426d29e7..0000000000 --- a/keras_nlp/models/opt/opt_presets_test.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.opt.opt_backbone import OPTBackbone -from keras_nlp.models.opt.opt_tokenizer import OPTTokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class OPTPresetSmokeTest(TestCase): - """ - A smoke test for GPT-2 presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/opt/opt_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = OPTTokenizer.from_preset("opt_125m_en") - outputs = tokenizer("The quick brown fox.") - expected_outputs = [133, 2119, 6219, 23602, 4] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[133, 2119, 6219, 23602, 4]]), - "padding_mask": ops.array([[1, 1, 1, 1, 1]]), - } - model = OPTBackbone.from_preset( - "opt_125m_en", load_weights=load_weights - ) - outputs = model(input_data)[0, 0, :5] - if load_weights: - # The forward pass from a preset should be stable! - # This test should catch cases where we unintentionally change our - # network code in a way that would invalidate our preset weights. - # We should only update these numbers if we are updating a weights - # file, or have found a discrepancy with the upstream source. - expected_outputs = [-0.246, -1.004, -0.072, 0.097, 0.533] - # Keep a high tolerance, so we are robust to different hardware. - self.assertAllClose(outputs, expected_outputs, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("opt_tokenizer", OPTTokenizer), - ("opt_backbone", OPTBackbone), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("opt_tokenizer", OPTTokenizer), - ("opt_backbone", OPTBackbone), - ) - def test_unknown_preset_error(self, cls): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("opt_clowntown") - - -@pytest.mark.extra_large -class OPTPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every GPT-2 preset and is only run manually. - Run with: - `pytest keras_nlp/models/opt/opt_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_opt(self, load_weights): - for preset in OPTBackbone.presets: - model = OPTBackbone.from_preset(preset, load_weights=load_weights) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 1024), - dtype="int64", - maxval=model.vocabulary_size, - ), - "padding_mask": ops.array([1] * 1024, shape=(1, 1024)), - } - model(input_data) - - def test_load_tokenizers(self): - for preset in OPTTokenizer.presets: - tokenizer = OPTTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") diff --git a/keras_nlp/models/opt/opt_tokenizer.py b/keras_nlp/models/opt/opt_tokenizer.py index b15aa94842..4fb62ee73a 100644 --- a/keras_nlp/models/opt/opt_tokenizer.py +++ b/keras_nlp/models/opt/opt_tokenizer.py @@ -70,35 +70,45 @@ class OPTTokenizer(BytePairTokenizer): def __init__( self, - vocabulary, - merges, + vocabulary=None, + merges=None, **kwargs, ): - # Special tokens. 
We use `""` as both a start and end token, as OPT - # was only pre-trained with `""` marking document boundaries. - start_token = "" - pad_token = "" - end_token = "" + self.start_token = "" + self.pad_token = "" + self.end_token = "" super().__init__( vocabulary=vocabulary, merges=merges, - unsplittable_tokens=[start_token, pad_token, end_token], + unsplittable_tokens=[ + self.start_token, + self.pad_token, + self.end_token, + ], **kwargs, ) - # Check whether special tokens are present in the vocabulary. - for token in [start_token, pad_token, end_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.start_token_id = self.token_to_id(start_token) - self.pad_token_id = self.token_to_id(pad_token) - self.end_token_id = self.token_to_id(end_token) + def set_vocabulary_and_merges(self, vocabulary, merges): + super().set_vocabulary_and_merges(vocabulary, merges) + + if vocabulary is not None: + # Check for necessary special tokens. + for token in [self.start_token, self.pad_token, self.end_token]: + if token not in self.vocabulary: + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.start_token_id = self.token_to_id(self.start_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.end_token_id = self.token_to_id(self.end_token) + else: + self.start_token_id = None + self.pad_token_id = None + self.end_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/opt/opt_tokenizer_test.py b/keras_nlp/models/opt/opt_tokenizer_test.py index af460db409..4b52ef1aed 100644 --- a/keras_nlp/models/opt/opt_tokenizer_test.py +++ b/keras_nlp/models/opt/opt_tokenizer_test.py @@ -12,66 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from keras_nlp.backend import keras +import pytest + from keras_nlp.models.opt.opt_tokenizer import OPTTokenizer from keras_nlp.tests.test_case import TestCase class OPTTokenizerTest(TestCase): def setUp(self): - self.vocab = { - "<pad>": 0, - "</s>": 1, - "Ġair": 2, - "plane": 3, - "Ġat": 4, - "port": 5, - "Ġkoh": 6, - "li": 7, - "Ġis": 8, - "Ġthe": 9, - "Ġbest": 10, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges - - self.tokenizer = OPTTokenizer(vocabulary=self.vocab, merges=self.merges) - - def test_tokenize(self): - input_data = " airplane at airport" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [2, 3, 4, 2, 5]) - - def test_tokenize_special_tokens(self): - input_data = "</s> airplane at airport</s><pad>" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [1, 2, 3, 4, 2, 5, 1, 0]) - - def test_tokenize_batch(self): - input_data = [" airplane at airport", " kohli is the best"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[2, 3, 4, 2, 5], [6, 7, 8, 9, 10]]) - - def test_detokenize(self): - input_tokens = [2, 3, 4, 2, 5] - output = self.tokenizer.detokenize(input_tokens) - self.assertEqual(output, " airplane at airport") - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 11) + self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + self.input_data = [ + " airplane at airport</s>", + " airplane airport", + ] + + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=OPTTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[3, 4, 5, 3, 6, 1], [3, 4, 3, 6]], + ) def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): OPTTokenizer(vocabulary=["a", "b", "c"], merges=[]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=OPTTokenizer, + preset="opt_125m_en", + input_data=["The quick brown fox."], + expected_output=[[133, 2119, 6219, 23602, 4]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in OPTTokenizer.presets: + self.run_preset_test( + cls=OPTTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/preprocessor.py b/keras_nlp/models/preprocessor.py index b5ea48a09b..16a65e57c2 100644 --- a/keras_nlp/models/preprocessor.py +++ b/keras_nlp/models/preprocessor.py @@ -16,6 +16,8 @@ from keras_nlp.layers.preprocessing.preprocessing_layer import ( PreprocessingLayer, ) +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring @@ -81,44 +83,18 @@ def from_preset( ) ``` """ - if not
cls.presets: - raise NotImplementedError( - "No presets have been created for this class." - ) - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - tokenizer = cls.tokenizer_cls.from_preset(preset) - - metadata = cls.presets[preset] - # For task model presets, the backbone config is nested. - if "backbone" in metadata["config"]: - backbone_config = metadata["config"]["backbone"]["config"] - else: - backbone_config = metadata["config"] - - # Use model's `max_sequence_length` if `sequence_length` unspecified; - # otherwise check that `sequence_length` not too long. - sequence_length = kwargs.pop("sequence_length", None) - max_sequence_length = backbone_config["max_sequence_length"] - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`sequence_length` cannot be longer than `{preset}` " - f"preset's `max_sequence_length` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - else: - sequence_length = max_sequence_length - - return cls( - tokenizer=tokenizer, - sequence_length=sequence_length, - **kwargs, + # We support short IDs for official presets, e.g. `"bert_base_en"`. + # Map these to a Kaggle Models handle. + if preset in cls.presets: + preset = cls.presets[preset]["kaggle_handle"] + + config_file = "tokenizer.json" + check_preset_class(preset, cls.tokenizer_cls, config_file=config_file) + tokenizer = load_from_preset( + preset, + config_file=config_file, ) + return cls(tokenizer=tokenizer, **kwargs) def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. diff --git a/keras_nlp/models/roberta/roberta_backbone_test.py b/keras_nlp/models/roberta/roberta_backbone_test.py index 9a466b527f..fe85e183a8 100644 --- a/keras_nlp/models/roberta/roberta_backbone_test.py +++ b/keras_nlp/models/roberta/roberta_backbone_test.py @@ -12,13 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
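A hedged usage sketch of the new `Preprocessor.from_preset()` path above: a short ID maps through `cls.presets` to its `kaggle_handle`, `tokenizer.json` is type-checked and loaded, and the remaining kwargs (notably `sequence_length`, which is no longer validated against the backbone's `max_sequence_length`) pass straight through to the constructor:

```python
from keras_nlp.models import OPTCausalLMPreprocessor

preprocessor = OPTCausalLMPreprocessor.from_preset(
    "opt_125m_en",  # short ID, mapped to its kaggle_handle
    sequence_length=128,  # forwarded to __init__ via **kwargs
)
x, y, sample_weight = preprocessor(["airplane at airport"])
```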
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.roberta.roberta_backbone import RobertaBackbone from keras_nlp.tests.test_case import TestCase @@ -26,92 +21,58 @@ class RobertaBackboneTest(TestCase): def setUp(self): - self.backbone = RobertaBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - self.batch_size = 8 - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_roberta(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_name(self): - self.assertRegexpMatches(self.backbone.name, "roberta_backbone") - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=RobertaBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) - def test_variable_sequence_length_call_roberta(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - output = self.backbone(input_data) - self.assertAllEqual( - ops.shape(output), - (2, seq_length, self.backbone.hidden_dim), - ) - - @pytest.mark.large # Saving is slow, so mark these large. + @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, RobertaBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=RobertaBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=RobertaBackbone, + preset="roberta_base_en", + input_data={ + "token_ids": ops.array([[0, 133, 2119, 2]], dtype="int32"), + "segment_ids": ops.zeros((1, 4), dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape=(1, 4, 768), + # The forward pass from a preset should be stable! 
+ expected_partial_output=ops.array( + [-0.051, 0.100, -0.010, -0.097, 0.059], + ), + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class RobertaBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = RobertaBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=64, - intermediate_dim=128, - max_sequence_length=128, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in RobertaBackbone.presets: + self.run_preset_test( + cls=RobertaBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/roberta/roberta_classifier_test.py b/keras_nlp/models/roberta/roberta_classifier_test.py index 6636768bdc..e85d2a3703 100644 --- a/keras_nlp/models/roberta/roberta_classifier_test.py +++ b/keras_nlp/models/roberta/roberta_classifier_test.py @@ -12,14 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.roberta.roberta_backbone import RobertaBackbone from keras_nlp.models.roberta.roberta_classifier import RobertaClassifier from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor @@ -29,27 +23,13 @@ class RobertaClassifierTest(TestCase): def setUp(self): - self.vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges + # Setup model. + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] self.preprocessor = RobertaPreprocessor( RobertaTokenizer(vocabulary=self.vocab, merges=self.merges), sequence_length=5, @@ -60,86 +40,42 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.classifier = RobertaClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, - hidden_dim=4, + max_sequence_length=self.preprocessor.sequence_length, ) - - # Setup data.
- self.raw_batch = [ - " airplane at airport", - " the airplane is the best", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + [" airplane at airport", " airplane airport"], # Features. + [1, 0], # Labels. ) - self.classifier.fit(self.preprocessed_dataset) + self.input_data = self.preprocessor(*self.train_data)[0] - def test_serialization(self): - # Defaults. - original = RobertaClassifier( - self.backbone, - num_classes=2, + def test_classifier_basics(self): + self.run_task_test( + cls=RobertaClassifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. - original = RobertaClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - hidden_dim=4, - name="test", - trainable=False, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - @pytest.mark.large # Saving is slow, so mark these large. + @pytest.mark.large def test_saved_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, RobertaClassifier) + self.run_model_saving_test( + cls=RobertaClassifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. 
- restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in RobertaClassifier.presets: + self.run_preset_test( + cls=RobertaClassifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py index 0d5a24e129..c69c300dc8 100644 --- a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py +++ b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py @@ -137,32 +137,30 @@ def __init__( truncate=truncate, **kwargs, ) - + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.start_token_id, - tokenizer.end_token_id, - tokenizer.pad_token_id, + self.tokenizer.start_token_id, + self.tokenizer.end_token_id, + self.tokenizer.pad_token_id, ], ) - - def get_config(self): - config = super().get_config() - config.update( - { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, - } - ) - return config + self.built = True def call(self, x, y=None, sample_weight=None): if y is not None or sample_weight is not None: @@ -184,3 +182,15 @@ def call(self, x, y=None, sample_weight=None): y = masker_outputs["mask_ids"] sample_weight = masker_outputs["mask_weights"] return pack_x_y_sample_weight(x, y, sample_weight) + + def get_config(self): + config = super().get_config() + config.update( + { + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, + } + ) + return config diff --git a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor_test.py b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor_test.py index 92abcf3011..ae762079e2 100644 --- a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
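For the masker deferral above, a minimal sketch of what `MaskedLMMaskGenerator` produces once built, assuming the toy RoBERTa ids used in the tests below (`<s>`=0, `<pad>`=1, `</s>`=2, `<mask>`=8); with selection and mask-token rates of 1.0, every eligible token is replaced:

```python
import tensorflow as tf

from keras_nlp.layers import MaskedLMMaskGenerator

masker = MaskedLMMaskGenerator(
    mask_selection_rate=1.0,
    mask_selection_length=4,
    mask_token_rate=1.0,
    random_token_rate=0.0,
    vocabulary_size=9,
    mask_token_id=8,  # "<mask>"
    unselectable_token_ids=[0, 1, 2],  # "<s>", "<pad>", "</s>"
)
out = masker(tf.constant([[0, 4, 5, 4, 7, 2]]))
# out["token_ids"]      -> [[0, 8, 8, 8, 8, 2]]
# out["mask_positions"] -> [[1, 2, 3, 4]]
# out["mask_ids"]       -> [[4, 5, 4, 7]]  (the original tokens, i.e. labels)
# out["mask_weights"]   -> [[1.0, 1.0, 1.0, 1.0]]
```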
-import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.roberta.roberta_masked_lm_preprocessor import ( RobertaMaskedLMPreprocessor, ) @@ -24,122 +23,68 @@ class RobertaMaskedLMPreprocessorTest(TestCase): def setUp(self): - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.tokenizer = RobertaTokenizer( + vocabulary=self.vocab, merges=self.merges ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + # Simplify our testing by masking every available token. + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - - self.preprocessor = RobertaMaskedLMPreprocessor( - tokenizer=RobertaTokenizer( - vocabulary=vocab, - merges=merges, + self.input_data = [" airplane airport"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=RobertaMaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[0, 8, 8, 8, 8, 2, 1, 1, 1, 1, 1, 1]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[4, 5, 4, 7]], + [[1.0, 1.0, 1.0, 1.0]], ), - # Simplify our testing by masking every available token.
- mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=5, - sequence_length=12, ) - def test_preprocess_strings(self): - input_data = " airplane at airport" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [0, 12, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 4, 5]) - self.assertAllEqual(y, [3, 4, 5, 3, 6]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0, 1.0]) - - def test_preprocess_list_of_strings(self): - input_data = [" airplane at airport"] * 4 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [[0, 12, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4, 5]] * 4) - self.assertAllEqual(y, [[3, 4, 5, 3, 6]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0, 1.0]] * 4) - - def test_preprocess_dataset(self): - sentences = tf.constant([" airplane at airport"] * 4) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x["token_ids"], [[0, 12, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 4, 5]] * 4) - self.assertAllEqual(y, [[3, 4, 5, 3, 6]] * 4) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 1.0, 1.0]] * 4) - - def test_mask_multiple_sentences(self): - sentence_one = tf.constant(" airplane") - sentence_two = tf.constant(" kohli") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - x["token_ids"], [0, 12, 12, 2, 2, 12, 12, 2, 1, 1, 1, 1] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 5, 6, 0]) - self.assertAllEqual(y, [3, 4, 7, 8, 0]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 1.0, 0.0]) - def test_no_masking_zero_rate(self): no_mask_preprocessor = RobertaMaskedLMPreprocessor( - self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, - mask_selection_length=5, + mask_selection_length=4, sequence_length=12, ) - input_data = " airplane at airport" - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [0, 3, 4, 5, 3, 6, 2, 1, 1, 1, 1, 1] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] + input_data = [" airplane airport"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[0, 4, 5, 4, 7, 2, 1, 1, 1, 1, 1, 1]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 0, 0, 0]], + [[0.0, 0.0, 0.0, 0.0]], + ), ) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0, 0]) - self.assertAllEqual(y, [0, 0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0, 0.0]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in RobertaMaskedLMPreprocessor.presets: + self.run_preset_test( + cls=RobertaMaskedLMPreprocessor, + 
preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/roberta/roberta_masked_lm_test.py b/keras_nlp/models/roberta/roberta_masked_lm_test.py index d2a5a27011..f4e410fa69 100644 --- a/keras_nlp/models/roberta/roberta_masked_lm_test.py +++ b/keras_nlp/models/roberta/roberta_masked_lm_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.roberta.roberta_backbone import RobertaBackbone from keras_nlp.models.roberta.roberta_masked_lm import RobertaMaskedLM from keras_nlp.models.roberta.roberta_masked_lm_preprocessor import ( @@ -29,27 +25,13 @@ class RobertaMaskedLMTest(TestCase): def setUp(self): - self.vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges + # Setup model. + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] self.preprocessor = RobertaMaskedLMPreprocessor( RobertaTokenizer(vocabulary=self.vocab, merges=self.merges), # Simplify our testing by masking every available token. @@ -65,66 +47,38 @@ def setUp(self): num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, - ) - self.masked_lm = RobertaMaskedLM( - self.backbone, - preprocessor=self.preprocessor, + max_sequence_length=self.preprocessor.sequence_length, ) - self.masked_lm_no_preprocessing = RobertaMaskedLM( - self.backbone, - preprocessor=None, - ) - - self.raw_batch = [ - " airplane at airport", - " the airplane is the best", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.masked_lm(self.preprocessed_batch[0]) - - def test_classifier_predict(self): - self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch[0]) - - def test_classifier_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.masked_lm) - new_classifier = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_classifier.get_config(), - self.masked_lm.get_config(), + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + [" airplane at airport", " airplane airport"], # Features.
) + self.input_data = self.preprocessor(*self.train_data)[0] - def test_classifier_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, + def test_masked_lm_basics(self): + self.run_task_test( + cls=RobertaMaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 9), ) - self.masked_lm.fit(self.preprocessed_dataset) @pytest.mark.large def test_saved_model(self): - model_output = self.masked_lm.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, RobertaMaskedLM) + self.run_model_saving_test( + cls=RobertaMaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output, atol=0.01, rtol=0.01) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in RobertaMaskedLM.presets: + self.run_preset_test( + cls=RobertaMaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/roberta/roberta_preprocessor.py b/keras_nlp/models/roberta/roberta_preprocessor.py index 7e641a93a8..556561d17c 100644 --- a/keras_nlp/models/roberta/roberta_preprocessor.py +++ b/keras_nlp/models/roberta/roberta_preprocessor.py @@ -143,24 +143,22 @@ def __init__( super().__init__(**kwargs) self.tokenizer = tokenizer + self.truncate = truncate + self.sequence_length = sequence_length + self.packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. self.packer = MultiSegmentPacker( start_value=self.tokenizer.start_token_id, end_value=self.tokenizer.end_token_id, sep_value=[self.tokenizer.end_token_id] * 2, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, - } - ) - return config + self.built = True def call(self, x, y=None, sample_weight=None): x = convert_inputs_to_list_of_tensor_segments(x) @@ -172,6 +170,16 @@ def call(self, x, y=None, sample_weight=None): } return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "truncate": self.truncate, + } + ) + return config + @classproperty def tokenizer_cls(cls): return RobertaTokenizer diff --git a/keras_nlp/models/roberta/roberta_preprocessor_test.py b/keras_nlp/models/roberta/roberta_preprocessor_test.py index 3471fd372d..5e7ad77514 100644 --- a/keras_nlp/models/roberta/roberta_preprocessor_test.py +++ b/keras_nlp/models/roberta/roberta_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
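The same deferral pattern applies to `MultiSegmentPacker` in `roberta_preprocessor.py` above. A sketch of its behavior under RoBERTa's double-separator convention (`</s></s>` between segments), again assuming the toy ids `<s>`=0, `<pad>`=1, `</s>`=2; the segment inputs are illustrative:

```python
import tensorflow as tf

from keras_nlp.layers import MultiSegmentPacker

packer = MultiSegmentPacker(
    start_value=0,  # "<s>"
    end_value=2,  # "</s>"
    sep_value=[2, 2],  # "</s></s>" between segments
    pad_value=1,  # "<pad>"
    truncate="round_robin",
    sequence_length=12,
)
token_ids, segment_ids = packer(
    (tf.ragged.constant([[4, 5]]), tf.ragged.constant([[6, 7]]))
)
# token_ids -> [[0, 4, 5, 2, 2, 6, 7, 2, 1, 1, 1, 1]]
```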
-import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer from keras_nlp.tests.test_case import TestCase @@ -22,123 +21,51 @@ class RobertaPreprocessorTest(TestCase): def setUp(self): - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - - self.preprocessor = RobertaPreprocessor( - tokenizer=RobertaTokenizer( - vocabulary=vocab, - merges=merges, - ), - sequence_length=12, - ) - - def test_tokenize_strings(self): - input_data = " airplane at airport" - - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [0, 3, 4, 5, 3, 6, 2, 1, 1, 1, 1, 1] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0] - ) - - def test_tokenize_list_of_strings(self): - input_data = [" airplane at airport"] * 4 - - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], - [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1, 1, 1]] * 4, - ) - - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant([" airplane at airport"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual( - x_out["token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = tf.constant([" airplane at airport"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x_out["token_ids"], [[0, 3, 4, 5, 3, 6, 2, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant(" airplane at airport") - sentence_two = tf.constant(" kohli is the best") - - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], [0, 3, 4, 5, 3, 2, 2, 7, 8, 9, 10, 2] + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.tokenizer = RobertaTokenizer( vocabulary=self.vocab, merges=self.merges ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + [" airplane at airport"], + [1], # Pass through labels. + [1.0], # Pass through sample_weights.
) - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant([" airplane at airport"] * 4) - sentence_two = tf.constant([" kohli is the best"] * 4) - - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], - [[0, 3, 4, 5, 3, 2, 2, 7, 8, 9, 10, 2]] * 4, - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=RobertaPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[0, 4, 5, 6, 4, 7, 2, 1]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. + ), ) def test_errors_for_2d_list_input(self): + preprocessor = RobertaPreprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + preprocessor(ambiguous_input) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in RobertaPreprocessor.presets: + self.run_preset_test( + cls=RobertaPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/roberta/roberta_presets.py b/keras_nlp/models/roberta/roberta_presets.py index f098bed5d7..66848cecd0 100644 --- a/keras_nlp/models/roberta/roberta_presets.py +++ b/keras_nlp/models/roberta/roberta_presets.py @@ -25,22 +25,7 @@ "path": "roberta", "model_card": "https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.md", }, - "config": { - "vocabulary_size": 50265, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/roberta_base_en/v1/model.h5", - "weights_hash": "958eede1c7edaa9308e027be18fde7a8", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/roberta_base_en/v1/vocab.json", - "vocabulary_hash": "be4d3c6f3f5495426b2c03b334334354", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/roberta_base_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": "kaggle://keras/roberta/keras/roberta_base_en/2", }, "roberta_large_en": { "metadata": { @@ -53,21 +38,6 @@ "path": "roberta", "model_card": "https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.md", }, - "config": { - "vocabulary_size": 50265, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/roberta_large_en/v1/model.h5", - "weights_hash": "1978b864c317a697fe62a894d3664f14", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/roberta_large_en/v1/vocab.json", - "vocabulary_hash": "be4d3c6f3f5495426b2c03b334334354", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/roberta_large_en/v1/merges.txt", - "merges_hash": "75a37753dd7a28a2c5df80c28bf06e4e", + "kaggle_handle": 
"kaggle://keras/roberta/keras/roberta_large_en/2", }, } diff --git a/keras_nlp/models/roberta/roberta_presets_test.py b/keras_nlp/models/roberta/roberta_presets_test.py deleted file mode 100644 index 22c1f2e8a6..0000000000 --- a/keras_nlp/models/roberta/roberta_presets_test.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.roberta.roberta_backbone import RobertaBackbone -from keras_nlp.models.roberta.roberta_classifier import RobertaClassifier -from keras_nlp.models.roberta.roberta_masked_lm import RobertaMaskedLM -from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor -from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -class RobertaPresetSmokeTest(TestCase): - """ - A smoke test for RoBERTa presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/roberta/roberta_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = RobertaTokenizer.from_preset( - "roberta_base_en", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [133, 2119, 6219, 23602, 4] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - preprocessor = RobertaPreprocessor.from_preset( - "roberta_base_en", - sequence_length=4, - ) - outputs = preprocessor("The quick brown fox.")["token_ids"] - expected_outputs = [0, 133, 2119, 2] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[0, 133, 2119, 2]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = RobertaBackbone.from_preset( - "roberta_base_en", load_weights=load_weights - ) - outputs = model(input_data) - if load_weights: - outputs = outputs[0, 0, :5] - expected = [-0.051, 0.100, -0.010, -0.097, 0.059] - self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["Let's rock!"] - model = RobertaClassifier.from_preset( - "roberta_base_en", num_classes=2, load_weights=load_weights - ) - # Never assert output values, as the head weights are random. 
- model.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_classifier_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = RobertaClassifier.from_preset( - "roberta_base_en", - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_masked_lm_output(self, load_weights): - input_data = ["Let's rock!"] - model = RobertaMaskedLM.from_preset( - "roberta_base_en", load_weights=load_weights - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_masked_lm_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[101, 1996, 4248, 102]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - "mask_positions": ops.array([[0, 0]]), - } - model = RobertaMaskedLM.from_preset( - "roberta_base_en", - load_weights=load_weights, - preprocessor=None, - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("roberta_tokenizer", RobertaTokenizer), - ("roberta_preprocessor", RobertaPreprocessor), - ("roberta", RobertaBackbone), - ("roberta_classifier", RobertaClassifier), - ("roberta_masked_lm", RobertaMaskedLM), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("roberta_tokenizer", RobertaTokenizer, {}), - ("roberta_preprocessor", RobertaPreprocessor, {}), - ("roberta", RobertaBackbone, {}), - ("roberta_classifier", RobertaClassifier, {"num_classes": 2}), - ("roberta_masked_lm", RobertaMaskedLM, {}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("roberta_base_en_clowntown", **kwargs) - - -@pytest.mark.extra_large -class RobertaPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every RoBERTa preset and is only run manually. 
- Run with: - `pytest keras_nlp/models/roberta/roberta_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_roberta(self, load_weights): - for preset in RobertaBackbone.presets: - model = RobertaBackbone.from_preset( - preset, load_weights=load_weights - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_roberta_classifier(self, load_weights): - for preset in RobertaClassifier.presets: - classifier = RobertaClassifier.from_preset( - preset, num_classes=4, load_weights=load_weights - ) - input_data = ["The quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_roberta_classifier_without_preprocessing(self, load_weights): - for preset in RobertaClassifier.presets: - classifier = RobertaClassifier.from_preset( - preset, - num_classes=2, - preprocessor=None, - load_weights=load_weights, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_roberta_masked_lm(self, load_weights): - for preset in RobertaMaskedLM.presets: - classifier = RobertaMaskedLM.from_preset( - preset, load_weights=load_weights - ) - input_data = ["The quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("load_weights", True), ("no_load_weights", False) - ) - def test_load_roberta_masked_lm_without_preprocessing(self, load_weights): - for preset in RobertaMaskedLM.presets: - classifier = RobertaMaskedLM.from_preset( - preset, - preprocessor=None, - load_weights=load_weights, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - "mask_positions": ops.array([1] * 128, shape=(1, 128)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in RobertaTokenizer.presets: - tokenizer = RobertaTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in RobertaPreprocessor.presets: - preprocessor = RobertaPreprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/roberta/roberta_tokenizer.py b/keras_nlp/models/roberta/roberta_tokenizer.py index 838f32a8ac..0cfabff754 100644 --- a/keras_nlp/models/roberta/roberta_tokenizer.py +++ b/keras_nlp/models/roberta/roberta_tokenizer.py @@ -77,36 +77,54 @@ class RobertaTokenizer(BytePairTokenizer): def __init__( self, - vocabulary, - merges, + vocabulary=None, + merges=None, **kwargs, ): - # Special tokens. 
- start_token = "<s>" - pad_token = "<pad>" - end_token = "</s>" - mask_token = "<mask>" + self.start_token = "<s>" + self.pad_token = "<pad>" + self.end_token = "</s>" + self.mask_token = "<mask>" super().__init__( vocabulary=vocabulary, merges=merges, - unsplittable_tokens=[start_token, pad_token, end_token, mask_token], + unsplittable_tokens=[ + self.start_token, + self.pad_token, + self.end_token, + self.mask_token, + ], **kwargs, ) - # Check whether special tokens are present in the vocabulary. - for token in [start_token, pad_token, end_token, mask_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.start_token_id = self.token_to_id(start_token) - self.pad_token_id = self.token_to_id(pad_token) - self.end_token_id = self.token_to_id(end_token) - self.mask_token_id = self.token_to_id(mask_token) + def set_vocabulary_and_merges(self, vocabulary, merges): + super().set_vocabulary_and_merges(vocabulary, merges) + + if vocabulary is not None: + # Check for necessary special tokens. + for token in [ + self.start_token, + self.pad_token, + self.end_token, + self.mask_token, + ]: + if token not in self.vocabulary: + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.start_token_id = self.token_to_id(self.start_token) + self.pad_token_id = self.token_to_id(self.pad_token) + self.end_token_id = self.token_to_id(self.end_token) + self.mask_token_id = self.token_to_id(self.mask_token) + else: + self.start_token_id = None + self.pad_token_id = None + self.end_token_id = None + self.mask_token_id = None @classproperty def presets(cls): diff --git a/keras_nlp/models/roberta/roberta_tokenizer_test.py b/keras_nlp/models/roberta/roberta_tokenizer_test.py index ef47f204eb..3b2305608d 100644 --- a/keras_nlp/models/roberta/roberta_tokenizer_test.py +++ b/keras_nlp/models/roberta/roberta_tokenizer_test.py @@ -12,67 +12,57 @@ # See the License for the specific language governing permissions and # limitations under the License.
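
Since the `RobertaTokenizer` change above makes `vocabulary` and `merges` optional at construction time, special-token ids now resolve lazily. A hedged usage sketch of the new behavior (the toy assets are the same ones used throughout these tests):

```python
from keras_nlp.models import RobertaTokenizer

vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port", "<mask>"]
vocab = dict((token, i) for i, token in enumerate(vocab))
merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e", "Ġa t"]
merges += ["p o", "r t", "Ġt h", "ai r", "pl a", "po rt", "Ġai r", "Ġa i", "pla ne"]

# No assets yet: ids are None rather than an error at construction.
tokenizer = RobertaTokenizer()
assert tokenizer.start_token_id is None

# Once assets arrive, the vocabulary check runs and ids resolve.
tokenizer.set_vocabulary_and_merges(vocab, merges)
assert tokenizer.start_token_id == 0  # "<s>"
assert tokenizer.mask_token_id == 8   # "<mask>"
```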
-from keras_nlp.backend import keras +import pytest + from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer from keras_nlp.tests.test_case import TestCase class RobertaTokenizerTest(TestCase): def setUp(self): - vocab = { - "<s>": 0, - "<pad>": 1, - "</s>": 2, - "Ġair": 3, - "plane": 4, - "Ġat": 5, - "port": 6, - "Ġkoh": 7, - "li": 8, - "Ġis": 9, - "Ġthe": 10, - "Ġbest": 11, - "<mask>": 12, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - - self.tokenizer = RobertaTokenizer(vocabulary=vocab, merges=merges) - - def test_tokenize(self): - input_data = " airplane at airport" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [3, 4, 5, 3, 6]) - - def test_tokenize_special_tokens(self): - input_data = "<s> airplane at airport</s><pad>" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [0, 3, 4, 5, 3, 6, 0, 1]) + self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"] + self.vocab += ["port", "<mask>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] + self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges} + self.input_data = [ + "<s> airplane at airport</s><pad>", + " airplane airport", + ] - def test_tokenize_batch(self): - input_data = [" airplane at airport", " kohli is the best"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[3, 4, 5, 3, 6], [7, 8, 9, 10, 11]]) - - def test_detokenize(self): - input_tokens = [[3, 4, 5, 3, 6]] - output = self.tokenizer.detokenize(input_tokens) - self.assertAllEqual(output, [" airplane at airport"]) - - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 13) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=RobertaTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + # TODO: </s> should not get tokenized as <s> + expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]], + expected_detokenize_output=[ + "<s> airplane at airport<s><pad>", + " airplane airport", + ], ) def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): RobertaTokenizer(vocabulary=["a", "b", "c"], merges=[]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=RobertaTokenizer, + preset="roberta_base_en", + input_data=["The quick brown fox."], + expected_output=[[133, 2119, 6219, 23602, 4]], ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in RobertaTokenizer.presets: + self.run_preset_test( + cls=RobertaTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/t5/t5_backbone.py b/keras_nlp/models/t5/t5_backbone.py index 7514cc51ae..2df5bd00bb 100644 --- a/keras_nlp/models/t5/t5_backbone.py +++ b/keras_nlp/models/t5/t5_backbone.py @@ -11,18 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
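
In the `T5Backbone` changes that follow, the new `key_value_dim` argument sizes each attention head's key/value projections independently of `hidden_dim // num_heads`. A construction sketch using the same toy dimensions as the updated test below; the explicit `key_value_dim` here is illustrative:

```python
from keras_nlp.backend import ops
from keras_nlp.models.t5.t5_backbone import T5Backbone

backbone = T5Backbone(
    vocabulary_size=10,
    num_layers=2,
    num_heads=2,
    hidden_dim=2,
    intermediate_dim=4,
    key_value_dim=1,  # Falls back to hidden_dim // num_heads if omitted.
)
outputs = backbone(
    {
        "encoder_token_ids": ops.ones((2, 3), dtype="int32"),
        "encoder_padding_mask": ops.zeros((2, 3), dtype="int32"),
        "decoder_token_ids": ops.ones((2, 3), dtype="int32"),
        "decoder_padding_mask": ops.zeros((2, 3), dtype="int32"),
    }
)
# Two named outputs, each of shape (batch, sequence, hidden_dim).
print(outputs["encoder_sequence_output"].shape)  # (2, 3, 2)
```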
+import copy -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding from keras_nlp.models.backbone import Backbone from keras_nlp.models.t5.t5_layer_norm import T5LayerNorm +from keras_nlp.models.t5.t5_presets import backbone_presets from keras_nlp.models.t5.t5_transformer_layer import T5TransformerLayer from keras_nlp.utils.python_utils import classproperty -from keras_nlp.utils.tensor_utils import assert_tf_backend -@keras_nlp_export("keras_nlp.models.T5Backbone") +@keras.saving.register_keras_serializable(package="keras_nlp") class T5Backbone(Backbone): """T5 encoder-decoder backbone model. @@ -51,11 +51,13 @@ class T5Backbone(Backbone): hidden_dim: int. The hidden size of the Transformer layers. intermediate_dim: int. The output dimension of the first Dense layer in a two-layer feedforward network for each Transformer layer. + key_value_dim: int. The dimension of each head of the key/value + projections in the multi-head attention layers. Defaults to + hidden_dim / num_heads. dropout: float. Dropout probability for the Transformer layers. activation: activation function (or activation string name). The activation to be used in the inner dense blocks of the - Transformer layers. The original T5 architecture used `"relu"`, - but more recent versions use `"gelu"`. Defaults to `"gelu"`. + Transformer layers. Defaults to `"relu"`. use_gated_activation: boolean. Whether to use activation gating in the inner dense blocks of the Transformer layers. The original T5 architecture didn't use gating, but more @@ -74,15 +76,14 @@ def __init__( num_heads, hidden_dim, intermediate_dim, + key_value_dim=None, dropout=0.1, - activation="gelu", + activation="relu", use_gated_activation=True, layer_norm_epsilon=1e-06, - tie_embedding_weights=False, + tie_embedding_weights=True, **kwargs, ): - assert_tf_backend(self.__class__.__name__) - # Encoder inputs encoder_token_ids = keras.Input( shape=(None,), dtype="int32", name="encoder_token_ids" @@ -121,10 +122,11 @@ def __init__( position_bias = None for i in range(num_layers): - x, position_bias = T5TransformerLayer( + output = T5TransformerLayer( is_decoder=False, hidden_dim=hidden_dim, intermediate_dim=intermediate_dim, + key_value_dim=key_value_dim or hidden_dim // num_heads, dropout=dropout, activation=activation, layer_norm_epsilon=layer_norm_epsilon, @@ -138,6 +140,8 @@ def __init__( position_bias=position_bias, use_causal_mask=False, ) + if isinstance(output, tuple): + x, position_bias = output x = T5LayerNorm( epsilon=layer_norm_epsilon, @@ -162,10 +166,11 @@ def __init__( position_bias = None for i in range(num_layers): - x, position_bias = T5TransformerLayer( + output = T5TransformerLayer( is_decoder=True, hidden_dim=hidden_dim, intermediate_dim=intermediate_dim, + key_value_dim=key_value_dim or hidden_dim // num_heads, dropout=dropout, activation=activation, layer_norm_epsilon=layer_norm_epsilon, @@ -181,6 +186,8 @@ def __init__( encoder_attention_mask=encoder_attention_mask, use_causal_mask=True, ) + if isinstance(output, tuple): + x, position_bias = output x = T5LayerNorm( epsilon=layer_norm_epsilon, @@ -212,7 +219,9 @@ def __init__( self.num_layers = num_layers self.num_heads = num_heads self.activation = keras.activations.get(activation) + self.key_value_dim = key_value_dim self.dropout = dropout + self.use_gated_activation = use_gated_activation self.layer_norm_epsilon = layer_norm_epsilon self.tie_embedding_weights = 
tie_embedding_weights self.token_embedding = token_embedding_layer @@ -227,7 +236,9 @@ def get_config(self): "num_layers": self.num_layers, "num_heads": self.num_heads, "activation": keras.activations.serialize(self.activation), + "key_value_dim": self.key_value_dim, "dropout": self.dropout, + "use_gated_activation": self.use_gated_activation, "layer_norm_epsilon": self.layer_norm_epsilon, "tie_embedding_weights": self.tie_embedding_weights, } @@ -236,4 +247,4 @@ def get_config(self): @classproperty def presets(cls): - return {} + return copy.deepcopy(backbone_presets) diff --git a/keras_nlp/models/t5/t5_backbone_test.py b/keras_nlp/models/t5/t5_backbone_test.py index ab4270d9d9..bb672afa2c 100644 --- a/keras_nlp/models/t5/t5_backbone_test.py +++ b/keras_nlp/models/t5/t5_backbone_test.py @@ -12,128 +12,82 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import config as backend_config +from keras_nlp.backend import ops from keras_nlp.models.t5.t5_backbone import T5Backbone from keras_nlp.tests.test_case import TestCase -@pytest.mark.tf_only -class T5Test(TestCase): +class T5BackboneTest(TestCase): def setUp(self): - self.backbone = T5Backbone( - vocabulary_size=4, - num_layers=2, - num_heads=2, - hidden_dim=4, - intermediate_dim=4, - ) - self.batch_size = 2 - seq_length = 3 - self.input_batch = { - "encoder_token_ids": np.ones( - (self.batch_size, seq_length), dtype="int32" - ), - "encoder_padding_mask": np.ones( - (self.batch_size, seq_length), dtype="int32" - ), - "decoder_token_ids": np.ones( - (self.batch_size, seq_length), dtype="int32" - ), - "decoder_padding_mask": np.ones( - (self.batch_size, seq_length), dtype="int32" - ), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + } + self.input_data = { + "encoder_token_ids": ops.ones((2, 3), dtype="int32"), + "encoder_padding_mask": ops.zeros((2, 3), dtype="int32"), + "decoder_token_ids": ops.ones((2, 3), dtype="int32"), + "decoder_padding_mask": ops.zeros((2, 3), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_t5(self): - self.backbone(self.input_batch) - def test_token_embedding(self): - output = self.backbone.token_embedding( - self.input_batch["encoder_token_ids"] + def test_backbone_basics(self): + self.run_backbone_test( + cls=T5Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "encoder_sequence_output": (2, 3, 2), + "decoder_sequence_output": (2, 3, 2), + }, ) - self.assertEqual(output.shape, (2, 3, 4)) - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "t5_backbone") + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=T5Backbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - def test_variable_sequence_length_call_t5(self): - for seq_length in (2, 3, 4): - input_data = { - "encoder_token_ids": np.ones( - (self.batch_size, seq_length), dtype="int32" - ), - "encoder_padding_mask": np.ones( - (self.batch_size, seq_length), dtype="int32" + @pytest.mark.large + @pytest.mark.skipif( + not backend_config.keras_3(), + reason="TODO: Fails in Keras2", + ) + def test_smallest_preset(self): + self.run_preset_test( + cls=T5Backbone, 
+ preset="t5_small_multi", + input_data=self.input_data, + expected_output_shape={ + "encoder_sequence_output": (2, 3, 512), + "decoder_sequence_output": (2, 3, 512), + }, + expected_partial_output={ + "encoder_sequence_output": ops.array( + [-0.0034, 0.0293, -0.0827, -0.1076] ), - "decoder_token_ids": np.ones( - (self.batch_size, seq_length), dtype="int32" + "decoder_sequence_output": ops.array( + [0.0097, 0.3576, -0.1508, 0.0150] ), - "decoder_padding_mask": np.ones( - (self.batch_size, seq_length), dtype="int32" - ), - } - outputs = self.backbone(input_data) - self.assertIn("encoder_sequence_output", outputs) - self.assertIn("decoder_sequence_output", outputs) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + }, ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) - - @pytest.mark.large # Saving is slow, so mark these large. - def test_saved_model(self): - outputs = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, T5Backbone) - - # Check that output matches. - restored_outputs = restored_model(self.input_batch) - for key in ["encoder_sequence_output", "decoder_sequence_output"]: - self.assertAllClose(outputs[key], restored_outputs[key]) - -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class T5BackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = T5Backbone( - vocabulary_size=4, - num_layers=2, - num_heads=2, - hidden_dim=4, - intermediate_dim=4, + @pytest.mark.extra_large + @pytest.mark.skipif( + not backend_config.keras_3(), + reason="TODO: Fails in Keras2", + ) + def test_all_presets(self): + for preset in T5Backbone.presets: + self.run_preset_test( + cls=T5Backbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 4), dtype="int32"), - "padding_mask": np.ones((8, 4), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - outputs = self.backbone.predict(self.input_dataset) - self.assertIn("encoder_sequence_output", outputs) - self.assertIn("decoder_sequence_output", outputs) diff --git a/keras_nlp/models/t5/t5_layer_norm.py b/keras_nlp/models/t5/t5_layer_norm.py index 7cfdb2315e..b4f157c004 100644 --- a/keras_nlp/models/t5/t5_layer_norm.py +++ b/keras_nlp/models/t5/t5_layer_norm.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import tensorflow as tf - from keras_nlp.backend import keras +from keras_nlp.backend import ops class T5LayerNorm(keras.layers.Layer): @@ -31,8 +30,6 @@ def build(self, input_shape): self.built = True def call(self, hidden_states): - variance = tf.math.reduce_mean( - tf.math.square(hidden_states), axis=-1, keepdims=True - ) - hidden_states = hidden_states * tf.math.rsqrt(variance + self.epsilon) + variance = ops.mean(ops.square(hidden_states), axis=-1, keepdims=True) + hidden_states = hidden_states * ops.rsqrt(variance + self.epsilon) return self.weight * hidden_states diff --git a/keras_nlp/models/t5/t5_multi_head_attention.py b/keras_nlp/models/t5/t5_multi_head_attention.py index 479de51e7d..77e7109efe 100644 --- a/keras_nlp/models/t5/t5_multi_head_attention.py +++ b/keras_nlp/models/t5/t5_multi_head_attention.py @@ -12,18 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf -from tensorflow.compiler.tf2xla.python.xla import dynamic_slice +import numpy as np from keras_nlp.backend import keras - - -def shape_list(tensor): - dynamic = tf.shape(tensor) - if tensor.shape == tf.TensorShape(None): - return dynamic - static = tensor.shape.as_list() - return [dynamic[i] if s is None else s for i, s in enumerate(static)] +from keras_nlp.backend import ops class T5MultiHeadAttention(keras.layers.Layer): @@ -33,6 +25,7 @@ def __init__( self, is_decoder, hidden_dim, + key_value_dim, num_heads, dropout, use_relative_attention_bias=False, @@ -41,7 +34,7 @@ def __init__( super().__init__(**kwargs) self.is_decoder = is_decoder self.hidden_dim = hidden_dim - self.key_value_dim = hidden_dim // num_heads + self.key_value_dim = key_value_dim self.num_heads = num_heads self.use_relative_attention_bias = use_relative_attention_bias @@ -123,39 +116,39 @@ def _relative_position_bucket( if bidirectional: num_buckets //= 2 relative_buckets += ( - tf.cast( - tf.math.greater(relative_position, 0), + ops.cast( + ops.greater(relative_position, 0), dtype=relative_position.dtype, ) * num_buckets ) - relative_position = tf.math.abs(relative_position) + relative_position = ops.abs(relative_position) else: - relative_position = -tf.math.minimum(relative_position, 0) + relative_position = -ops.minimum(relative_position, 0) # now n is in the range [0, inf) max_exact = num_buckets // 2 - is_small = tf.math.less(relative_position, max_exact) - relative_position_if_large = max_exact + tf.cast( - tf.math.log( - tf.cast(relative_position, "float32") - / tf.cast(max_exact, "float32") + is_small = ops.less(relative_position, max_exact) + relative_position_if_large = max_exact + ops.cast( + ops.log( + ops.cast(relative_position, "float32") + / ops.cast(max_exact, "float32") ) - / tf.math.log(max_distance / max_exact) + / ops.cast(ops.log(max_distance / max_exact), "float32") * (num_buckets - max_exact), dtype=relative_position.dtype, ) - relative_position_if_large = tf.math.minimum( + relative_position_if_large = ops.minimum( relative_position_if_large, num_buckets - 1 ) - relative_buckets += tf.where( + relative_buckets += ops.where( is_small, relative_position, relative_position_if_large ) return relative_buckets def compute_bias(self, query_length, key_length): """Compute binned relative position bias""" - context_position = tf.range(query_length)[:, None] - memory_position = tf.range(key_length)[None, :] + context_position = ops.arange(query_length)[:, None] + memory_position = ops.arange(key_length)[None, :] relative_position = ( 
memory_position - context_position ) # shape (query_length, key_length) @@ -165,11 +158,11 @@ def compute_bias(self, query_length, key_length): num_buckets=self.relative_attention_buckets, max_distance=self.relative_attention_max_distance, ) - values = tf.gather( - self.relative_attention_bias, relative_position_bucket + values = ops.take( + self.relative_attention_bias, relative_position_bucket, axis=0 ) # shape (query_length, key_length, num_heads) - values = tf.expand_dims( - tf.transpose(values, [2, 0, 1]), axis=0 + values = ops.expand_dims( + ops.transpose(values, axes=(2, 0, 1)), axis=0 ) # shape (1, num_heads, query_length, key_length) return values @@ -186,7 +179,7 @@ def call( ): # Input is (batch_size, query_length, dim) # past_key_value[0] is (batch_size, num_heads, q_len - 1, dim_per_head) - batch_size, seq_length = shape_list(hidden_states)[:2] + batch_size, seq_length = ops.shape(hidden_states)[:2] real_seq_length = seq_length @@ -197,7 +190,7 @@ def call( f"keys and values. Got {len(past_key_value)} past states." ) real_seq_length += ( - shape_list(past_key_value[0])[2] + ops.shape(past_key_value[0])[2] if query_length is None else query_length ) @@ -205,21 +198,21 @@ def call( key_length = ( real_seq_length if key_value_states is None - else shape_list(key_value_states)[1] + else ops.shape(key_value_states)[1] ) def shape(hidden_states): - return tf.transpose( - tf.reshape( + return ops.transpose( + ops.reshape( hidden_states, (batch_size, -1, self.num_heads, self.key_value_dim), ), - perm=(0, 2, 1, 3), + axes=(0, 2, 1, 3), ) def unshape(hidden_states): - return tf.reshape( - tf.transpose(hidden_states, perm=(0, 2, 1, 3)), + return ops.reshape( + ops.transpose(hidden_states, axes=(0, 2, 1, 3)), (batch_size, -1, self.inner_dim), ) @@ -240,7 +233,7 @@ def project( if key_value_states is None: # self-attention # (batch_size, num_heads, key_length, dim_per_head) - hidden_states = tf.concat( + hidden_states = ops.concat( [past_key_value, hidden_states], axis=2 ) else: @@ -267,13 +260,13 @@ def project( past_key_value[1] if past_key_value is not None else None, ) - scores = tf.einsum( + scores = ops.einsum( "bnqd,bnkd->bnqk", query_states, key_states ) # (batch_size, num_heads, query_length, key_length) if position_bias is None: if not self.use_relative_attention_bias: - position_bias = tf.zeros( + position_bias = ops.zeros( (1, self.num_heads, real_seq_length, key_length), self.compute_dtype, ) @@ -289,10 +282,10 @@ def project( # we might have a padded past structure, # in which case we want to fetch the position bias slice # right after the most recently filled past index - most_recently_filled_past_index = tf.reduce_max( - tf.where(past_key_value[0][0, 0, :, 0] != 0.0) + most_recently_filled_past_index = ops.amax( + ops.where(past_key_value[0][0, 0, :, 0] != 0.0) ) - position_bias = dynamic_slice( + position_bias = ops.slice( position_bias, (0, 0, most_recently_filled_past_index + 1, 0), (1, self.num_heads, seq_length, real_seq_length), @@ -300,13 +293,13 @@ def project( if mask is not None: # Add a new mask axis for the head dim. - mask = mask[:, tf.newaxis, :, :] + mask = mask[:, np.newaxis, :, :] # Add a very large negative position bias for masked positions. 
- mask = (1.0 - tf.cast(mask, position_bias.dtype)) * -1e9 + mask = (1.0 - ops.cast(mask, position_bias.dtype)) * -1e9 position_bias = position_bias + mask scores += position_bias - weights = tf.nn.softmax( + weights = ops.nn.softmax( scores, axis=-1 ) # (batch_size, num_heads, query_length, key_length) weights = self.dropout_layer( @@ -315,9 +308,9 @@ def project( # Optionally mask heads if layer_head_mask is not None: - weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * weights + weights = ops.reshape(layer_head_mask, (1, -1, 1, 1)) * weights - attention_output = tf.matmul( + attention_output = ops.matmul( weights, value_states ) # (batch_size, num_heads, query_length, dim_per_head) diff --git a/keras_nlp/models/t5/t5_presets.py b/keras_nlp/models/t5/t5_presets.py new file mode 100644 index 0000000000..58b301b7f0 --- /dev/null +++ b/keras_nlp/models/t5/t5_presets.py @@ -0,0 +1,95 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""T5 model preset configurations.""" + +backbone_presets = { + "t5_small_multi": { + "metadata": { + "description": ( + "8-layer T5 model. Trained on the Colossal Clean Crawled " + "Corpus (C4)." + ), + "params": 0, + "official_name": "T5", + "path": "t5", + "model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md", + }, + "kaggle_handle": "kaggle://keras/t5/keras/t5_small_multi/2", + }, + "t5_base_multi": { + "metadata": { + "description": ( + "12-layer T5 model. Trained on the Colossal Clean Crawled " + "Corpus (C4)." + ), + "params": 0, + "official_name": "T5", + "path": "t5", + "model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md", + }, + "kaggle_handle": "kaggle://keras/t5/keras/t5_base_multi/2", + }, + "t5_large_multi": { + "metadata": { + "description": ( + "24-layer T5 model. Trained on the Colossal Clean Crawled " + "Corpus (C4)." + ), + "params": 0, + "official_name": "T5", + "path": "t5", + "model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md", + }, + "kaggle_handle": "kaggle://keras/t5/keras/t5_large_multi/2", + }, + "flan_small_multi": { + "metadata": { + "description": ( + "8-layer T5 model. Trained on the Colossal Clean Crawled " + "Corpus (C4)." + ), + "params": 0, + "official_name": "T5", + "path": "t5", + "model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md", + }, + "kaggle_handle": "kaggle://keras/t5/keras/flan_small_multi/2", + }, + "flan_base_multi": { + "metadata": { + "description": ( + "12-layer T5 model. Trained on the Colossal Clean Crawled " + "Corpus (C4)."
+ ), + "params": 0, + "official_name": "T5", + "path": "t5", + "model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md", + }, + "kaggle_handle": "kaggle://keras/t5/keras/flan_base_multi/2", + }, + "flan_large_multi": { + "metadata": { + "description": ( + "24-layer T5 model. Trained on the Colossal Clean Crawled " + "Corpus (C4)." + ), + "params": 0, + "official_name": "T5", + "path": "t5", + "model_card": "https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.md", + }, + "kaggle_handle": "kaggle://keras/t5/keras/flan_large_multi/2", + }, +} diff --git a/keras_nlp/models/t5/t5_tokenizer.py b/keras_nlp/models/t5/t5_tokenizer.py index ae9facb318..5feb2d9ab8 100644 --- a/keras_nlp/models/t5/t5_tokenizer.py +++ b/keras_nlp/models/t5/t5_tokenizer.py @@ -11,12 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy -from keras_nlp.api_export import keras_nlp_export +from keras_nlp.backend import keras +from keras_nlp.models.t5.t5_presets import backbone_presets from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer +from keras_nlp.utils.python_utils import classproperty -@keras_nlp_export("keras_nlp.models.T5Tokenizer") +@keras.saving.register_keras_serializable(package="keras_nlp") class T5Tokenizer(SentencePieceTokenizer): """T5 tokenizer layer based on SentencePiece. @@ -73,20 +76,30 @@ class T5Tokenizer(SentencePieceTokenizer): """ def __init__(self, proto, **kwargs): + self.end_token = "</s>" + self.pad_token = "<pad>" + super().__init__(proto=proto, **kwargs) - # Check for necessary special tokens. - end_token = "</s>" - pad_token = "<pad>" - for token in [pad_token]: - if token not in self.get_vocabulary(): - raise ValueError( - f"Cannot find token `'{token}'` in the provided " - f"`vocabulary`. Please provide `'{token}'` in your " - "`vocabulary` or use a pretrained `vocabulary` name." - ) - - self.pad_token_id = self.token_to_id(pad_token) - self.end_token_id = self.token_to_id(end_token) - # T5 uses the same start token as end token, i.e., "</s>". - self.start_token_id = self.end_token_id + def set_proto(self, proto): + super().set_proto(proto) + if proto is not None: + for token in [self.end_token, self.pad_token]: + if token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + self.end_token_id = self.token_to_id(self.end_token) + self.pad_token_id = self.token_to_id(self.pad_token) + # T5 uses the same start token as end token, i.e., "</s>". + self.start_token_id = self.end_token_id + else: + self.end_token_id = None + self.pad_token_id = None + self.start_token_id = None + + @classproperty + def presets(cls): + return copy.deepcopy(backbone_presets) diff --git a/keras_nlp/models/t5/t5_tokenizer_test.py b/keras_nlp/models/t5/t5_tokenizer_test.py index ce16492acd..a7558e8b13 100644 --- a/keras_nlp/models/t5/t5_tokenizer_test.py +++ b/keras_nlp/models/t5/t5_tokenizer_test.py @@ -12,76 +12,54 @@ # See the License for the specific language governing permissions and # limitations under the License.
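
With `presets` now exposed on `T5Tokenizer` above, the short preset IDs from `t5_presets.py` should be loadable directly; a hedged usage sketch (assumes network access to fetch the Kaggle-hosted assets):

```python
from keras_nlp.models.t5.t5_tokenizer import T5Tokenizer

tokenizer = T5Tokenizer.from_preset("t5_small_multi")
token_ids = tokenizer("The quick brown fox.")
round_trip = tokenizer.detokenize(token_ids)
```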
-import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.t5.t5_tokenizer import T5Tokenizer from keras_nlp.tests.test_case import TestCase class T5TokenizerTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=11, - model_type="WORD", - bos_id=-1, - pad_id=0, - eos_id=1, - unk_id=2, - pad_piece="<pad>", - eos_piece="</s>", - unk_piece="<unk>", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.tokenizer = T5Tokenizer(proto=self.proto) - - def test_tokenize(self): - input_data = "the quick brown fox" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [4, 9, 5, 7]) - - def test_tokenize_batch(self): - input_data = ["the quick brown fox", "the earth is round"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[4, 9, 5, 7], [4, 6, 8, 10]]) + self.init_kwargs = { + # Generated using create_t5_test_proto.py + "proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm") + } + self.input_data = ["the quick brown fox", "the earth is round"] - def test_detokenize(self): - input_data = [[4, 9, 5, 7]] - output = self.tokenizer.detokenize(input_data) - self.assertEqual(output, ["the quick brown fox"]) - - def test_vocabulary_size(self): - tokenizer = T5Tokenizer(proto=self.proto) - self.assertEqual(tokenizer.vocabulary_size(), 11) + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=T5Tokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[4, 9, 5, 7], [4, 6, 8, 10]], + ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - T5Tokenizer(proto=bytes_io.getvalue()) + T5Tokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), - ) + @pytest.mark.large + def test_smallest_preset(self): + for preset in T5Tokenizer.presets: + self.run_preset_test( + cls=T5Tokenizer, + preset=preset, + input_data=["The quick brown fox."], + expected_output=[[37, 1704, 4216, 3, 20400, 5]], + ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in T5Tokenizer.presets: + self.run_preset_test( + cls=T5Tokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/t5/t5_transformer_layer.py b/keras_nlp/models/t5/t5_transformer_layer.py index ce4a28d67f..27b4c9892c 100644 --- a/keras_nlp/models/t5/t5_transformer_layer.py +++ b/keras_nlp/models/t5/t5_transformer_layer.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License.
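
The updated `t5_tokenizer_test.py` above reads a checked-in `t5_test_vocab.spm` instead of training a SentencePiece model inline. The referenced `create_t5_test_proto.py` is not part of this diff; a plausible reconstruction of it, based on the removed inline training code:

```python
import io

import sentencepiece

bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["the quick brown fox", "the earth is round"]),
    model_writer=bytes_io,
    vocab_size=11,
    model_type="WORD",
    bos_id=-1,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    pad_piece="<pad>",
    eos_piece="</s>",
    unk_piece="<unk>",
    user_defined_symbols="[MASK]",
)
with open("t5_test_vocab.spm", "wb") as f:
    f.write(bytes_io.getvalue())
```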
-import tensorflow as tf - from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.layers.modeling.transformer_layer_utils import ( compute_causal_mask, ) @@ -28,6 +27,7 @@ def __init__( is_decoder, hidden_dim, intermediate_dim, + key_value_dim, dropout, activation, layer_norm_epsilon, @@ -41,10 +41,11 @@ def __init__( self.use_gated_activation = use_gated_activation self.self_attention = T5MultiHeadAttention( - is_decoder, - hidden_dim, - num_heads, - dropout, + is_decoder=is_decoder, + hidden_dim=hidden_dim, + key_value_dim=key_value_dim, + num_heads=num_heads, + dropout=dropout, use_relative_attention_bias=use_relative_attention_bias, name="self_attention", ) @@ -53,10 +54,11 @@ def __init__( if self.is_decoder: self.cross_attention = T5MultiHeadAttention( - is_decoder, - hidden_dim, - num_heads, - dropout, + is_decoder=is_decoder, + hidden_dim=hidden_dim, + key_value_dim=key_value_dim, + num_heads=num_heads, + dropout=dropout, use_relative_attention_bias=False, name="cross_attention", ) @@ -103,10 +105,10 @@ def call( training=False, ): if use_causal_mask: - shape = tf.shape(hidden_states) + shape = ops.shape(hidden_states) batch_size, length = shape[0], shape[1] causal_mask = compute_causal_mask(batch_size, length, length) - attention_mask = tf.cast(attention_mask, "int32") + attention_mask = ops.cast(attention_mask, "int32") attention_mask = causal_mask & attention_mask x = hidden_states # Intermediate result. @@ -147,4 +149,7 @@ def call( x = self.dropout_layer(x, training=training) x = x + residual - return x, position_bias + if position_bias is not None: + return x, position_bias + else: + return x diff --git a/keras_nlp/models/task.py b/keras_nlp/models/task.py index 5d9a605449..ee28e3a984 100644 --- a/keras_nlp/models/task.py +++ b/keras_nlp/models/task.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import keras_core from rich import console as rich_console from rich import markup from rich import table as rich_table +from keras_nlp.backend import config from keras_nlp.backend import keras from keras_nlp.utils.keras_utils import print_msg from keras_nlp.utils.pipeline_model import PipelineModel +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring @@ -34,6 +34,19 @@ def __init__(self, *args, **kwargs): self._backbone = None self._preprocessor = None super().__init__(*args, **kwargs) + self._functional_layer_ids = set( + id(layer) for layer in self._flatten_layers() + ) + + def __dir__(self): + # Temporary fixes for weight saving. This mimics the following PR for + # older version of Keras: https://github.com/keras-team/keras/pull/18982 + def filter_fn(attr): + if attr == "_layer_checkpoint_dependencies": + return False + return id(getattr(self, attr)) not in self._functional_layer_ids + + return filter(filter_fn, super().__dir__()) def _check_for_loss_mismatch(self, loss): """Check for a softmax/from_logits mismatch after compile. @@ -79,6 +92,9 @@ def _check_for_loss_mismatch(self, loss): ) def compile(self, optimizer="rmsprop", loss=None, **kwargs): + # Temporarily disable jit compilation on torch. 
+ if config.backend() == "torch": + kwargs["jit_compile"] = False self._check_for_loss_mismatch(loss) super().compile(optimizer=optimizer, loss=loss, **kwargs) @@ -171,43 +187,43 @@ def from_preset( ) ``` """ - if not cls.presets: - raise NotImplementedError( - "No presets have been created for this class." - ) - - if preset not in cls.presets: + if "backbone" in kwargs: raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" + "You cannot pass a `backbone` argument to the `from_preset` " + f"method. Instead, call the {cls.__name__} default " + "constructor with a `backbone` argument. " + f"Received: backbone={kwargs['backbone']}." ) - - if "preprocessor" not in kwargs: - kwargs["preprocessor"] = cls.preprocessor_cls.from_preset(preset) - - # Check if preset is backbone-only model - if preset in cls.backbone_cls.presets: - backbone = cls.backbone_cls.from_preset(preset, load_weights) - return cls(backbone, **kwargs) - - # Otherwise must be one of class presets - metadata = cls.presets[preset] - config = metadata["config"] - model = cls.from_config({**config, **kwargs}) - - if not load_weights: - return model - - weights = keras.utils.get_file( - "model.h5", - metadata["weights_url"], - cache_subdir=os.path.join("models", preset), - file_hash=metadata["weights_hash"], + # We support short IDs for official presets, e.g. `"bert_base_en"`. + # Map these to a Kaggle Models handle. + if preset in cls.presets: + preset = cls.presets[preset]["kaggle_handle"] + + preset_cls = check_preset_class(preset, (cls, cls.backbone_cls)) + + # Backbone case. + if preset_cls == cls.backbone_cls: + backbone = load_from_preset( + preset, + load_weights=load_weights, + ) + if "preprocessor" in kwargs: + preprocessor = kwargs.pop("preprocessor") + else: + tokenizer = load_from_preset( + preset, + config_file="tokenizer.json", + ) + preprocessor = cls.preprocessor_cls(tokenizer=tokenizer) + return cls(backbone=backbone, preprocessor=preprocessor, **kwargs) + + # Task case. + return load_from_preset( + preset, + load_weights=load_weights, + config_overrides=kwargs, ) - model.load_weights(weights) - return model - def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. super().__init_subclass__(**kwargs) @@ -315,11 +331,21 @@ def bold_text(x): if print_fn: print_fn(console.end_capture(), line_break=False) - # Hardcode summary from keras_core for now. - keras_core.Model.summary( - self, - line_length=line_length, - positions=positions, - print_fn=print_fn, - **kwargs, - ) + # Avoid `tf.keras.Model.summary()`, so the above output matches. 
+ if config.keras_3(): + super().summary( + line_length=line_length, + positions=positions, + print_fn=print_fn, + **kwargs, + ) + else: + import keras_core + + keras_core.Model.summary( + self, + line_length=line_length, + positions=positions, + print_fn=print_fn, + **kwargs, + ) diff --git a/keras_nlp/models/whisper/whisper_audio_feature_extractor.py b/keras_nlp/models/whisper/whisper_audio_feature_extractor.py index 73b15a3afd..5fade1d63b 100644 --- a/keras_nlp/models/whisper/whisper_audio_feature_extractor.py +++ b/keras_nlp/models/whisper/whisper_audio_feature_extractor.py @@ -17,15 +17,17 @@ import numpy as np import tensorflow as tf -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras +from keras_nlp.layers.preprocessing.preprocessing_layer import ( + PreprocessingLayer, +) from keras_nlp.models.whisper.whisper_presets import backbone_presets from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring -@keras_nlp_export("keras_nlp.models.WhisperAudioFeatureExtractor") -class WhisperAudioFeatureExtractor(keras.layers.Layer): +@keras.saving.register_keras_serializable(package="keras_nlp") +class WhisperAudioFeatureExtractor(PreprocessingLayer): """ Whisper audio feature extractor layer. @@ -163,9 +165,10 @@ def _get_mel_filters(self): weights *= enorm[:, np.newaxis] weights = np.transpose(weights) - return tf.constant(weights, dtype=self.dtype) + return tf.constant(weights, dtype=self.compute_dtype) def _extract_audio_features(self, audio): + audio = tf.cast(audio, self.compute_dtype) # Use "reflection" padding - `tf.signal.stft` uses symmetric padding # internally. audio = tf.pad( @@ -246,6 +249,8 @@ def call(self, audio): # Find the log mel spectrogram. log_spec = self._extract_audio_features(audio) + if rank_1_input: + log_spec = tf.squeeze(log_spec, 0) return log_spec def get_config(self): diff --git a/keras_nlp/models/whisper/whisper_audio_feature_extractor_test.py b/keras_nlp/models/whisper/whisper_audio_feature_extractor_test.py index 8eca8f9c79..ff3178950e 100644 --- a/keras_nlp/models/whisper/whisper_audio_feature_extractor_test.py +++ b/keras_nlp/models/whisper/whisper_audio_feature_extractor_test.py @@ -14,7 +14,6 @@ import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.whisper.whisper_audio_feature_extractor import ( WhisperAudioFeatureExtractor, ) @@ -23,53 +22,33 @@ class WhisperAudioFeatureExtractorTest(TestCase): def setUp(self): - self.num_mels = 80 - self.num_fft_bins = 400 - self.stride = 100 - self.sampling_rate = 100 - self.max_audio_length = 5 - self.audio_feature_extractor = WhisperAudioFeatureExtractor( - num_mels=self.num_mels, - num_fft_bins=self.num_fft_bins, - stride=self.stride, - sampling_rate=self.sampling_rate, - max_audio_length=self.max_audio_length, + self.init_kwargs = { + "num_mels": 80, + "num_fft_bins": 400, + "stride": 100, + "sampling_rate": 100, + "max_audio_length": 5, + } + audio_tensor_1 = tf.ones((2,), dtype="float32") + audio_tensor_2 = tf.ones((25,), dtype="float32") + self.input_data = tf.ragged.stack( + [audio_tensor_1, audio_tensor_2], + axis=0, ) - def test_unbatched_inputs(self): - audio_tensor = tf.ones((2,), dtype="float32") + def test_feature_extractor_basics(self): + self.run_preprocessing_layer_test( + cls=WhisperAudioFeatureExtractor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - outputs = self.audio_feature_extractor(audio_tensor) + def test_correctness(self): + audio_tensor = tf.ones((2,), 
dtype="float32") + outputs = WhisperAudioFeatureExtractor(**self.init_kwargs)(audio_tensor) # Verify shape. - self.assertEqual(outputs.shape, (1, 5, self.num_mels)) + self.assertEqual(outputs.shape, (5, 80)) # Verify output. expected = [1.1656, 1.0151, -0.8343, -0.8343, -0.8343] - self.assertAllClose(outputs[0, :, 0], expected, atol=0.01, rtol=0.01) - - def test_batched_inputs(self): - audio_tensor_1 = tf.ones((2,), dtype="float32") - audio_tensor_2 = tf.ones((25,), dtype="float32") - audio_tensor = tf.ragged.stack([audio_tensor_1, audio_tensor_2], axis=0) - - outputs = self.audio_feature_extractor(audio_tensor) - - # Verify shape. - self.assertEqual(outputs.shape, (2, 5, self.num_mels)) - # Verify output. - expected_1 = [1.1656, 1.0151, -0.8343, -0.8343, -0.8343] - self.assertAllClose(outputs[0, :, 0], expected_1, atol=0.01, rtol=0.01) - expected_2 = [1.2299, 1.0970, 0.3997, -0.7700, -0.7700] - self.assertAllClose(outputs[1, :, 0], expected_2, atol=0.01, rtol=0.01) - - def test_serialization(self): - config = keras.saving.serialize_keras_object( - self.audio_feature_extractor - ) - new_audio_feature_extractor = keras.saving.deserialize_keras_object( - config - ) - self.assertEqual( - new_audio_feature_extractor.get_config(), - self.audio_feature_extractor.get_config(), - ) + self.assertAllClose(outputs[:, 0], expected, atol=0.01, rtol=0.01) diff --git a/keras_nlp/models/whisper/whisper_backbone.py b/keras_nlp/models/whisper/whisper_backbone.py index 87c94ae7fd..c66fcc8089 100644 --- a/keras_nlp/models/whisper/whisper_backbone.py +++ b/keras_nlp/models/whisper/whisper_backbone.py @@ -14,7 +14,6 @@ import copy -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.layers.modeling.position_embedding import PositionEmbedding @@ -38,7 +37,7 @@ def call(self, x): return ops.pad(x, [[0, 0], [1, 1], [0, 0]]) -@keras_nlp_export("keras_nlp.models.WhisperBackbone") +@keras.saving.register_keras_serializable(package="keras_nlp") class WhisperBackbone(Backbone): """A Whisper encoder-decoder network for speech. @@ -187,9 +186,7 @@ def __init__( x = WhisperEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, - activation=lambda x: keras.activations.gelu( - x, approximate=False - ), + activation=keras.activations.gelu, layer_norm_epsilon=1e-5, dropout=dropout, kernel_initializer=whisper_kernel_initializer(), @@ -229,9 +226,7 @@ def __init__( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, - activation=lambda x: keras.activations.gelu( - x, approximate=False - ), + activation=keras.activations.gelu, layer_norm_epsilon=1e-5, kernel_initializer=whisper_kernel_initializer(), normalize_first=True, diff --git a/keras_nlp/models/whisper/whisper_backbone_test.py b/keras_nlp/models/whisper/whisper_backbone_test.py index e3266ea9d5..63d57615af 100644 --- a/keras_nlp/models/whisper/whisper_backbone_test.py +++ b/keras_nlp/models/whisper/whisper_backbone_test.py @@ -12,13 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
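> Aside before the backbone tests: the feature-extractor change above also alters the unbatched contract, which is easy to miss in the diff. A minimal sketch of the new behavior, assuming `keras_nlp` with this change applied and reusing the toy parameters from the test's `init_kwargs`:

```python
import tensorflow as tf

from keras_nlp.models.whisper.whisper_audio_feature_extractor import (
    WhisperAudioFeatureExtractor,
)

extractor = WhisperAudioFeatureExtractor(
    num_mels=80,
    num_fft_bins=400,
    stride=100,
    sampling_rate=100,  # 5 s at 100 Hz with stride 100 -> 5 output frames.
    max_audio_length=5,
)

# Rank-1 input: the batch dimension added internally is now squeezed back out.
unbatched = extractor(tf.ones((200,), dtype="float32"))
print(unbatched.shape)  # (5, 80) after this change; (1, 5, 80) before it.

# Ragged batches of different-length clips still produce a dense batch.
batched = extractor(tf.ragged.stack([tf.ones((2,)), tf.ones((25,))], axis=0))
print(batched.shape)  # (2, 5, 80)
```

> This is what lets `WhisperPreprocessor.call` later in this diff drop its manual `tf.squeeze` on rank-1 audio.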
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.whisper.whisper_backbone import WhisperBackbone from keras_nlp.tests.test_case import TestCase @@ -26,135 +22,93 @@ @pytest.mark.tf_only class WhisperBackboneTest(TestCase): def setUp(self): - self.backbone = WhisperBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_encoder_sequence_length=6, - max_decoder_sequence_length=6, - ) - self.input_batch = { - "encoder_features": np.ones((2, 5, 80), dtype="float32"), - "decoder_token_ids": np.ones((2, 5), dtype="int32"), - "decoder_padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_encoder_sequence_length": 6, + "max_decoder_sequence_length": 6, + } + self.input_data = { + "encoder_features": ops.ones((2, 5, 80), dtype="float32"), + "decoder_token_ids": ops.ones((2, 5), dtype="int32"), + "decoder_padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_whisper(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding( - self.input_batch["decoder_token_ids"] - ) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "whisper_backbone") - - def test_variable_sequence_length_call_whisper(self): - for seq_length in (2, 3, 4): - input_data = { - "encoder_features": np.ones( - (2, seq_length, 80), dtype="float32" - ), - "decoder_token_ids": np.ones((2, seq_length), dtype="int32"), - "decoder_padding_mask": np.ones((2, seq_length), dtype="int32"), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=WhisperBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape={ + "encoder_sequence_output": (2, 3, 2), + "decoder_sequence_output": (2, 5, 2), + }, ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) def test_key_projection_bias_absence(self): + backbone = WhisperBackbone(**self.init_kwargs) # Check only for the first encoder layer and first decoder layer. self.assertIsNone( - self.backbone.get_layer( + backbone.get_layer( "transformer_encoder_layer_0" )._self_attention_layer._key_dense.bias ) self.assertIsNone( - self.backbone.get_layer( + backbone.get_layer( "transformer_decoder_layer_0" )._self_attention_layer._key_dense.bias ) self.assertIsNone( - self.backbone.get_layer( + backbone.get_layer( "transformer_decoder_layer_0" )._cross_attention_layer._key_dense.bias ) - @pytest.mark.large # Saving is slow, so mark these large. + @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. 
- self.assertIsInstance(restored_model, WhisperBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose( - model_output["encoder_sequence_output"], - restored_output["encoder_sequence_output"], + self.run_model_saving_test( + cls=WhisperBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, ) - self.assertAllClose( - model_output["decoder_sequence_output"], - restored_output["decoder_sequence_output"], - ) - - -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class WhisperBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = WhisperBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_encoder_sequence_length=6, - max_decoder_sequence_length=6, - ) - self.input_batch = { - "encoder_features": np.ones( - ( - 8, - self.backbone.max_encoder_sequence_length, - 80, + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=WhisperBackbone, + preset="whisper_tiny_en", + input_data={ + "encoder_features": ops.ones((1, 3000, 80)), + "decoder_token_ids": ops.array( + [[50257, 50362, 464, 2068, 7586, 21831, 13, 50256, 50256]] ), - dtype="int32", - ), - "decoder_token_ids": np.ones( - (8, self.backbone.max_decoder_sequence_length), dtype="int32" - ), - "decoder_padding_mask": np.ones( - (8, self.backbone.max_decoder_sequence_length), dtype="int32" - ), - } - - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) + "decoder_padding_mask": ops.array( + [[1, 1, 1, 1, 1, 1, 1, 1, 0]] + ), + }, + expected_output_shape={ + "encoder_sequence_output": (1, 1500, 384), + "decoder_sequence_output": (1, 9, 384), + }, + # The forward pass from a preset should be stable! + expected_partial_output={ + "encoder_sequence_output": ops.array( + [-0.21382, -0.48528, 0.42348, -1.33874, -0.14191] + ), + "decoder_sequence_output": ops.array( + [13.238, 1.051, 8.348, -20.012, -5.022] + ), + }, + ) - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in WhisperBackbone.presets: + self.run_preset_test( + cls=WhisperBackbone, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/whisper/whisper_cached_multi_head_attention.py b/keras_nlp/models/whisper/whisper_cached_multi_head_attention.py new file mode 100644 index 0000000000..01ad18ba4b --- /dev/null +++ b/keras_nlp/models/whisper/whisper_cached_multi_head_attention.py @@ -0,0 +1,155 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Whisper Cached Multi-Head Attention layer.""" + +import collections +import string + +import keras_nlp +from keras_nlp.backend import keras + + +def _index_to_einsum_variable(i): + """Converts an index to a einsum variable name. + + We simply map indices to lowercase characters, e.g. 0 -> 'a', 1 -> 'b'. 
+ """ + return string.ascii_lowercase[i] + + +def _build_proj_equation(free_dims, bound_dims, output_dims): + """Builds an einsum equation for projections inside multi-head attention.""" + input_str = "" + kernel_str = "" + output_str = "" + bias_axes = "" + letter_offset = 0 + for i in range(free_dims): + char = _index_to_einsum_variable(i + letter_offset) + input_str += char + output_str += char + + letter_offset += free_dims + for i in range(bound_dims): + char = _index_to_einsum_variable(i + letter_offset) + input_str += char + kernel_str += char + + letter_offset += bound_dims + for i in range(output_dims): + char = _index_to_einsum_variable(i + letter_offset) + kernel_str += char + output_str += char + bias_axes += char + equation = f"{input_str},{kernel_str}->{output_str}" + + return equation, bias_axes, len(output_str) + + +def _get_output_shape(output_rank, known_last_dims): + return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims) + + +@keras.saving.register_keras_serializable(package="keras_nlp") +class WhisperCachedMultiHeadAttention( + keras_nlp.layers.CachedMultiHeadAttention +): + """Whisper Cached Multi-Head Attention layer. + + Inherits from `keras_nlp.layers.CachedMultiHeadAttention`, and overrides the + `build` method so that Q, V projection layers have bias + whereas K projection layer does not. + """ + + def build( + self, + query_shape, + value_shape, + key_shape=None, + ): + key_shape = value_shape if key_shape is None else key_shape + query_rank = len(query_shape) + value_rank = len(value_shape) + key_rank = len(key_shape) + einsum_equation, bias_axes, output_rank = _build_proj_equation( + query_rank - 1, bound_dims=1, output_dims=2 + ) + self._query_dense = keras.layers.EinsumDense( + einsum_equation, + output_shape=_get_output_shape( + output_rank - 1, [self._num_heads, self._key_dim] + ), + bias_axes=bias_axes if self._use_bias else None, + name="query", + **self._get_common_kwargs_for_sublayer(), + ) + self._query_dense.build(query_shape) + einsum_equation, bias_axes, output_rank = _build_proj_equation( + key_rank - 1, bound_dims=1, output_dims=2 + ) + self._key_dense = keras.layers.EinsumDense( + einsum_equation, + output_shape=_get_output_shape( + output_rank - 1, [self._num_heads, self._key_dim] + ), + bias_axes=None, + name="key", + **self._get_common_kwargs_for_sublayer(), + ) + self._key_dense.build(key_shape) + einsum_equation, bias_axes, output_rank = _build_proj_equation( + value_rank - 1, bound_dims=1, output_dims=2 + ) + self._value_dense = keras.layers.EinsumDense( + einsum_equation, + output_shape=_get_output_shape( + output_rank - 1, [self._num_heads, self._value_dim] + ), + bias_axes=bias_axes if self._use_bias else None, + name="value", + **self._get_common_kwargs_for_sublayer(), + ) + self._value_dense.build(value_shape) + + # Builds the attention computations for multi-head dot product + # attention. These computations could be wrapped into the keras + # attention layer once it supports multi-head einsum computations. 
+        self._build_attention(output_rank)
+
+        if self._output_shape:
+            if not isinstance(self._output_shape, collections.abc.Sized):
+                output_shape = [self._output_shape]
+            else:
+                output_shape = self._output_shape
+        else:
+            output_shape = [query_shape[-1]]
+        einsum_equation, bias_axes, output_rank = _build_proj_equation(
+            query_rank - 1, bound_dims=2, output_dims=len(output_shape)
+        )
+        self._output_dense = keras.layers.EinsumDense(
+            einsum_equation,
+            output_shape=_get_output_shape(output_rank - 1, output_shape),
+            bias_axes=bias_axes if self._use_bias else None,
+            name="attention_output",
+            **self._get_common_kwargs_for_sublayer(),
+        )
+        output_dense_input_shape = list(
+            self._query_dense.compute_output_shape(query_shape)
+        )
+        output_dense_input_shape[-1] = self._value_dim
+        self._output_dense.build(tuple(output_dense_input_shape))
+        self.built = True
+
+    def _build_from_signature(self, query, value, key=None):
+        pass
diff --git a/keras_nlp/models/whisper/whisper_decoder.py b/keras_nlp/models/whisper/whisper_decoder.py
index 7f5d834741..c41a870a42 100644
--- a/keras_nlp/models/whisper/whisper_decoder.py
+++ b/keras_nlp/models/whisper/whisper_decoder.py
@@ -11,33 +11,130 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Whisper decoder block."""
+
 from keras_nlp.backend import keras
 from keras_nlp.layers.modeling.transformer_decoder import TransformerDecoder
+from keras_nlp.models.whisper.whisper_cached_multi_head_attention import (
+    WhisperCachedMultiHeadAttention,
+)
+from keras_nlp.utils.keras_utils import clone_initializer
 
 
 @keras.saving.register_keras_serializable(package="keras_nlp")
 class WhisperDecoder(TransformerDecoder):
-    """A Whisper decoder.
+    """Whisper decoder.
 
     Inherits from `keras_nlp.layers.TransformerDecoder`, and overrides the
-    `build` method so as to remove the bias term from the key projection layer.
+    `build` method to use
+    `keras_nlp.models.whisper.whisper_cached_multi_head_attention.WhisperCachedMultiHeadAttention`
+    in place of `keras_nlp.layers.CachedMultiHeadAttention` for both the self
+    and cross attention layers (the Whisper variant omits the key projection
+    bias).
     """
 
     def build(
         self,
         decoder_sequence_shape,
-        encoder_sequence_shape=None,
+        encoder_sequence_shape,
     ):
-        super().build(
-            decoder_sequence_shape,
-            encoder_sequence_shape=encoder_sequence_shape,
-        )
-
-        # Since there is no exposed option for this in MHA, we will reach into
-        # the internals of the layer for now.
-        self._self_attention_layer._key_dense.bias_axes = None
-        self._self_attention_layer._key_dense.bias = None
-        if self._cross_attention_layer:
-            self._cross_attention_layer._key_dense.bias_axes = None
-            self._cross_attention_layer._key_dense.bias = None
+        self._decoder_sequence_shape = decoder_sequence_shape
+        self._encoder_sequence_shape = encoder_sequence_shape
+        # Infer the dimension of our hidden feature size from the build shape.
+        hidden_dim = decoder_sequence_shape[-1]
+        # Attention head size is `hidden_dim` divided by the number of heads.
+        head_dim = int(hidden_dim // self.num_heads)
+        if head_dim == 0:
+            raise ValueError(
+                "The computed attention `head_dim` cannot be zero. "
+                f"The `hidden_dim` value of {hidden_dim} must be greater "
+                f"than or equal to the `num_heads` value of {self.num_heads}."
+            )
+
+        # Self attention layers.
+ self._self_attention_layer = WhisperCachedMultiHeadAttention( + num_heads=self.num_heads, + key_dim=head_dim, + dropout=self.dropout, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="self_attention", + ) + + self._self_attention_layer.build( + query_shape=decoder_sequence_shape, + value_shape=decoder_sequence_shape, + ) + self._self_attention_layer_norm = keras.layers.LayerNormalization( + epsilon=self.layer_norm_epsilon, + dtype=self.dtype_policy, + name="self_attention_layer_norm", + ) + self._self_attention_layer_norm.build(decoder_sequence_shape) + self._self_attention_dropout = keras.layers.Dropout( + rate=self.dropout, + dtype=self.dtype_policy, + name="self_attention_dropout", + ) + + self._cross_attention_layer = WhisperCachedMultiHeadAttention( + num_heads=self.num_heads, + key_dim=head_dim, + value_dim=head_dim, + dropout=self.dropout, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="cross_attention", + ) + self._cross_attention_layer.build( + query_shape=decoder_sequence_shape, + value_shape=encoder_sequence_shape, + ) + self._cross_attention_layer_norm = keras.layers.LayerNormalization( + epsilon=self.layer_norm_epsilon, + dtype=self.dtype_policy, + name="cross_attention_layer_norm", + ) + self._cross_attention_layer_norm.build(decoder_sequence_shape) + self._cross_attention_dropout = keras.layers.Dropout( + rate=self.dropout, + dtype=self.dtype_policy, + name="cross_attention_dropout", + ) + + # Feedforward layers. + self._feedforward_intermediate_dense = keras.layers.Dense( + self.intermediate_dim, + activation=self.activation, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="feedforward_intermediate_dense", + ) + self._feedforward_intermediate_dense.build(decoder_sequence_shape) + self._feedforward_output_dense = keras.layers.Dense( + hidden_dim, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="feedforward_output_dense", + ) + intermediate_shape = list(decoder_sequence_shape) + intermediate_shape[-1] = self.intermediate_dim + self._feedforward_output_dense.build(tuple(intermediate_shape)) + self._feedforward_layer_norm = keras.layers.LayerNormalization( + epsilon=self.layer_norm_epsilon, + dtype=self.dtype_policy, + name="feedforward_layer_norm", + ) + self._feedforward_layer_norm.build(decoder_sequence_shape) + self._feedforward_dropout = keras.layers.Dropout( + rate=self.dropout, + dtype=self.dtype_policy, + name="feedforward_dropout", + ) + # Create layers based on input shape. + self.built = True diff --git a/keras_nlp/models/whisper/whisper_encoder.py b/keras_nlp/models/whisper/whisper_encoder.py index 31267cbf78..9d5b41d0d2 100644 --- a/keras_nlp/models/whisper/whisper_encoder.py +++ b/keras_nlp/models/whisper/whisper_encoder.py @@ -11,23 +11,95 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
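> Before the encoder changes: the projection equations emitted by the `_build_proj_equation` helper in `whisper_cached_multi_head_attention.py` above can be sanity-checked in isolation. This is an illustrative sketch only; the helpers are module-private, and the import assumes the new module is on the path:

```python
from keras_nlp.models.whisper.whisper_cached_multi_head_attention import (
    _build_proj_equation,
    _get_output_shape,
)

# Q/K/V projections of a (batch, seq, hidden) input: two free dims, one bound
# dim (hidden), and two output dims (heads, head_dim).
equation, bias_axes, output_rank = _build_proj_equation(
    free_dims=2, bound_dims=1, output_dims=2
)
assert equation == "abc,cde->abde"
assert bias_axes == "de"  # Bias spans (heads, head_dim); only Q and V use it.
assert output_rank == 4

# Leading dims are left unknown; only the trailing dims are pinned.
assert _get_output_shape(output_rank - 1, [6, 64]) == [None, 6, 64]
```

> The encoder and decoder below both build on this layer, which is why the key projection's missing bias is asserted in `test_key_projection_bias_absence` above.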
+"""Whisper encoder block.""" + from keras_nlp.backend import keras from keras_nlp.layers.modeling.transformer_encoder import TransformerEncoder +from keras_nlp.models.whisper.whisper_cached_multi_head_attention import ( + WhisperCachedMultiHeadAttention, +) +from keras_nlp.utils.keras_utils import clone_initializer @keras.saving.register_keras_serializable(package="keras_nlp") class WhisperEncoder(TransformerEncoder): - """A Whisper encoder. + """Whisper encoder. Inherits from `keras_nlp.layers.TransformerEncoder`, and overrides the - `build` method so as to remove the bias term from the key projection layer. + `_build` method to use the + `keras_nlp.models.whisper.whisper_multi_head_attention.WhisperCachedMultiHeadAttention` + layer instead of `keras.layers.MultiHeadAttention`. """ def build(self, inputs_shape): - super().build(inputs_shape) + # Infer the dimension of our hidden feature size from the build shape. + hidden_dim = inputs_shape[-1] + # Attention head size is `hidden_dim` over the number of heads. + key_dim = int(hidden_dim // self.num_heads) + if key_dim == 0: + raise ValueError( + "Attention `key_dim` computed cannot be zero. " + f"The `hidden_dim` value of {hidden_dim} has to be equal to " + f"or greater than `num_heads` value of {self.num_heads}." + ) + + # Self attention layers. + self._self_attention_layer = WhisperCachedMultiHeadAttention( + num_heads=self.num_heads, + key_dim=key_dim, + dropout=self.dropout, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="self_attention_layer", + ) + self._self_attention_layer.build( + query_shape=inputs_shape, + value_shape=inputs_shape, + ) + + self._self_attention_layer_norm = keras.layers.LayerNormalization( + epsilon=self.layer_norm_epsilon, + dtype=self.dtype_policy, + name="self_attention_layer_norm", + ) + self._self_attention_layer_norm.build(inputs_shape) + self._self_attention_dropout = keras.layers.Dropout( + rate=self.dropout, + dtype=self.dtype_policy, + name="self_attention_dropout", + ) - # Since there is no exposed option for this in MHA, we will reach into - # the internals of the layer for now. - self._self_attention_layer._key_dense.bias_axes = None - self._self_attention_layer._key_dense.bias = None + # Feedforward layers. 
+ self._feedforward_layer_norm = keras.layers.LayerNormalization( + epsilon=self.layer_norm_epsilon, + dtype=self.dtype_policy, + name="feedforward_layer_norm", + ) + self._feedforward_layer_norm.build(inputs_shape) + self._feedforward_intermediate_dense = keras.layers.Dense( + self.intermediate_dim, + activation=self.activation, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="feedforward_intermediate_dense", + ) + self._feedforward_intermediate_dense.build(inputs_shape) + self._feedforward_output_dense = keras.layers.Dense( + hidden_dim, + kernel_initializer=clone_initializer(self.kernel_initializer), + bias_initializer=clone_initializer(self.bias_initializer), + dtype=self.dtype_policy, + name="feedforward_output_dense", + ) + intermediate_shape = list(inputs_shape) + intermediate_shape[-1] = self.intermediate_dim + self._feedforward_output_dense.build(tuple(intermediate_shape)) + self._feedforward_dropout = keras.layers.Dropout( + rate=self.dropout, + dtype=self.dtype_policy, + name="feedforward_dropout", + ) + self.built = True diff --git a/keras_nlp/models/whisper/whisper_preprocessor.py b/keras_nlp/models/whisper/whisper_preprocessor.py index 88fd60cae2..dbac2e11fc 100644 --- a/keras_nlp/models/whisper/whisper_preprocessor.py +++ b/keras_nlp/models/whisper/whisper_preprocessor.py @@ -14,10 +14,8 @@ import copy -import tensorflow as tf from absl import logging -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.layers.preprocessing.start_end_packer import StartEndPacker from keras_nlp.models.preprocessor import Preprocessor @@ -31,10 +29,9 @@ ) from keras_nlp.utils.keras_utils import pack_x_y_sample_weight from keras_nlp.utils.python_utils import classproperty -from keras_nlp.utils.python_utils import format_docstring -@keras_nlp_export("keras_nlp.models.WhisperPreprocessor") +@keras.saving.register_keras_serializable(package="keras_nlp") class WhisperPreprocessor(Preprocessor): """A Whisper preprocessing layer which handles audio and text input. @@ -50,9 +47,11 @@ class WhisperPreprocessor(Preprocessor): directly to a Whisper model. Args: - audio_feature_extractor: A `keras_nlp.models.WhisperAudioFeatureExtractor` - instance. tokenizer: A `keras_nlp.models.WhisperTokenizer` instance. + audio_feature_extractor: A + `keras_nlp.models.WhisperAudioFeatureExtractor` instance or `None`. + If `None` a feature extractor with default parameters will be + created. decoder_sequence_length: The length of the packed decoder inputs. language: string, language token. Should only be passed if your tokenizer is multilingual. @@ -74,7 +73,9 @@ class WhisperPreprocessor(Preprocessor): Directly calling the layer on data. ```python - preprocessor = keras_nlp.models.WhisperPreprocessor.from_preset("whisper_tiny_en") + preprocessor = keras_nlp.models.WhisperPreprocessor.from_preset( + "whisper_tiny_en", + ) # Preprocess unbatched inputs. 
input_data = { @@ -154,8 +155,8 @@ class WhisperPreprocessor(Preprocessor): def __init__( self, - audio_feature_extractor, tokenizer, + audio_feature_extractor=None, decoder_sequence_length=448, language=None, task=None, @@ -163,50 +164,65 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + if audio_feature_extractor is None: + audio_feature_extractor = WhisperAudioFeatureExtractor() self.audio_feature_extractor = audio_feature_extractor self.tokenizer = tokenizer + self.decoder_sequence_length = decoder_sequence_length + self.language = language + self.task = task + self.no_timestamps = no_timestamps + self.decoder_packer = None + + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. # Create list of tokens to be prepended to decoder inputs. bos_tokens = [self.tokenizer.bos_token_id] if self.tokenizer.language_tokens is not None: if ( - language is None - or language not in self.tokenizer.language_tokens + self.language is None + or self.language not in self.tokenizer.language_tokens ): raise ValueError( "You must pass a non-None value for `language` when using " "a multilingual tokenizer. The value must be one of " f'{",".join(self.tokenizer.language_tokens.keys())}. ' - f"Received: language={language}." + f"Received: language={self.language}." ) - if task is None or task not in ["transcribe", "translate"]: + if self.task is None or self.task not in [ + "transcribe", + "translate", + ]: raise ValueError( "You must pass a non-None value for `task` when using " "a multilingual tokenizer. The value must be one of " - f'`"transcribe"`, `"translate"`. Received: task={task}.' + '`"transcribe"`, `"translate"`. ' + f"Received: task={self.task}." ) - bos_tokens += [self.tokenizer.language_tokens[language]] + bos_tokens += [self.tokenizer.language_tokens[self.language]] - if task == "transcribe": + if self.task == "transcribe": bos_tokens += [self.tokenizer.special_tokens["<|transcribe|>"]] - elif task == "translate": + elif self.task == "translate": bos_tokens += [self.tokenizer.special_tokens["<|translate|>"]] else: - if language is not None: + if self.language is not None: logging.info( "`tokenizer` is monolingual, and `language` has a " "non-`None` value. Setting `language` to `None`." ) - language = None - if task is not None: + self.language = None + if self.task is not None: logging.info( "`tokenizer` is monolingual, and `task` has a " "non-`None` value. Setting `task` to `None`." 
) - task = None + self.task = None - if no_timestamps: + if self.no_timestamps: bos_tokens += [self.tokenizer.no_timestamps_token_id] # TODO: Use `MultiSegmentPacker` instead of `StartEndPacker` once we @@ -216,44 +232,10 @@ def __init__( start_value=bos_tokens, end_value=self.tokenizer.eos_token_id, pad_value=self.tokenizer.pad_token_id, - sequence_length=decoder_sequence_length, + sequence_length=self.decoder_sequence_length, return_padding_mask=True, ) - self.decoder_sequence_length = decoder_sequence_length - self.language = language - self.task = task - self.no_timestamps = no_timestamps - - def get_config(self): - config = super().get_config() - config.update( - { - "audio_feature_extractor": keras.layers.serialize( - self.audio_feature_extractor - ), - "decoder_sequence_length": self.decoder_sequence_length, - "language": self.language, - "task": self.task, - "no_timestamps": self.no_timestamps, - } - ) - return config - - @classmethod - def from_config(cls, config): - if "tokenizer" in config and isinstance(config["tokenizer"], dict): - config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) - - if "audio_feature_extractor" in config and isinstance( - config["audio_feature_extractor"], dict - ): - config["audio_feature_extractor"] = keras.layers.deserialize( - config["audio_feature_extractor"] - ) - - return cls(**config) - def call(self, x, y=None, sample_weight=None, decoder_sequence_length=None): if not ( isinstance(x, dict) @@ -278,9 +260,6 @@ def call(self, x, y=None, sample_weight=None, decoder_sequence_length=None): ) encoder_features = self.audio_feature_extractor(encoder_audio[0]) - if encoder_audio[0].shape.rank < 2: - encoder_features = tf.squeeze(encoder_features, axis=0) - decoder_sequence_length = ( decoder_sequence_length or self.decoder_sequence_length ) @@ -298,6 +277,35 @@ def call(self, x, y=None, sample_weight=None, decoder_sequence_length=None): return pack_x_y_sample_weight(x, y, sample_weight) + def get_config(self): + config = super().get_config() + config.update( + { + "audio_feature_extractor": keras.layers.serialize( + self.audio_feature_extractor + ), + "decoder_sequence_length": self.decoder_sequence_length, + "language": self.language, + "task": self.task, + "no_timestamps": self.no_timestamps, + } + ) + return config + + @classmethod + def from_config(cls, config): + if "tokenizer" in config and isinstance(config["tokenizer"], dict): + config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) + + if "audio_feature_extractor" in config and isinstance( + config["audio_feature_extractor"], dict + ): + config["audio_feature_extractor"] = keras.layers.deserialize( + config["audio_feature_extractor"] + ) + + return cls(**config) + @classproperty def audio_feature_extractor_cls(cls): return WhisperAudioFeatureExtractor @@ -309,97 +317,3 @@ def tokenizer_cls(cls): @classproperty def presets(cls): return copy.deepcopy(backbone_presets) - - @classmethod - def from_preset( - cls, - preset, - language=None, - task=None, - no_timestamps=True, - **kwargs, - ): - """Instantiate `WhisperPreprocessor` from preset architecture. - - Args: - preset: string. Must be one of "{{preset_names}}". - language: string, language token (eg., `"<|en|>"`). Should only be - passed if your tokenizer is multilingual. - task: string, task name. One of `"transcribe"`, `"translate"`. - Should only be passed if your tokenizer is multilingual. - no_timestamps: bool. If True, `"<|no_timestamps|>"` will be added as - a special token to your input. 
- - Examples: - ```python - # Load a preprocessor layer from a preset. - preprocessor = keras_nlp.models.WhisperPreprocessor.from_preset( - "{{example_preset_name}}", - ) - ``` - """ - # Override base class's `from_preset` to handle audio feature extractor - # , `decoder_sequence_length` and special tokens. - if not cls.presets: - raise NotImplementedError( - "No presets have been created for this class." - ) - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - - audio_feature_extractor = cls.audio_feature_extractor_cls.from_preset( - preset - ) - tokenizer = cls.tokenizer_cls.from_preset(preset) - - metadata = cls.presets[preset] - # For task model presets, the backbone config is nested. - if "backbone" in metadata["config"]: - backbone_config = metadata["config"]["backbone"]["config"] - else: - backbone_config = metadata["config"] - - # Use model's `max_decoder_sequence_length` if `decoder_sequence_length` - # is unspecified; otherwise check that `decoder_sequence_length` is not - # too long. - decoder_sequence_length = kwargs.pop("decoder_sequence_length", None) - max_decoder_sequence_length = backbone_config[ - "max_decoder_sequence_length" - ] - - def check_sequence_length(sequence_length, max_sequence_length, name): - if sequence_length is not None: - if sequence_length > max_sequence_length: - raise ValueError( - f"`{name}` cannot be longer than `{preset}` " - f"preset's `max_{name}` of {max_sequence_length}. " - f"Received: {sequence_length}." - ) - return sequence_length - else: - return max_sequence_length - - decoder_sequence_length = check_sequence_length( - decoder_sequence_length, - max_decoder_sequence_length, - "decoder_sequence_length", - ) - - return cls( - audio_feature_extractor=audio_feature_extractor, - tokenizer=tokenizer, - decoder_sequence_length=decoder_sequence_length, - language=language, - task=task, - no_timestamps=no_timestamps, - **kwargs, - ) - - -format_docstring( - example_preset_name=next(iter(backbone_presets), ""), - preset_names='", "'.join(backbone_presets), -)(WhisperPreprocessor.from_preset.__func__) diff --git a/keras_nlp/models/whisper/whisper_preprocessor_test.py b/keras_nlp/models/whisper/whisper_preprocessor_test.py index 3f07ef618f..6837dc8bfa 100644 --- a/keras_nlp/models/whisper/whisper_preprocessor_test.py +++ b/keras_nlp/models/whisper/whisper_preprocessor_test.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
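> With the custom `from_preset` override removed, construction is much simpler: a default feature extractor is created when none is passed, and the packer is assembled lazily in `build()`. A hypothetical usage sketch; `from_preset` fetches assets over the network, and per the removed `AUDIO_FEATURE_EXTRACTOR_CONFIG` later in this diff, the default extractor uses 16 kHz sampling and a 30 s max audio length:

```python
import numpy as np

from keras_nlp.models.whisper.whisper_preprocessor import WhisperPreprocessor
from keras_nlp.models.whisper.whisper_tokenizer import WhisperTokenizer

# `audio_feature_extractor` is now optional and defaults to a
# `WhisperAudioFeatureExtractor()` with stock parameters.
preprocessor = WhisperPreprocessor(
    tokenizer=WhisperTokenizer.from_preset("whisper_tiny_en"),
)

# The BOS token list and `StartEndPacker` are only assembled inside `build()`,
# i.e. on first call, so tokenizer assets are guaranteed to have loaded by
# the time they are needed.
x = preprocessor(
    {
        "encoder_audio": np.ones((200,), dtype="float32"),
        "decoder_text": " airplane at airport",
    }
)
print(x["encoder_features"].shape, x["decoder_token_ids"].shape)
```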
-import tensorflow as tf +import numpy as np -from keras_nlp.backend import keras from keras_nlp.models.whisper.whisper_audio_feature_extractor import ( WhisperAudioFeatureExtractor, ) @@ -25,40 +24,18 @@ class WhisperPreprocessorTest(TestCase): def setUp(self): - self.num_mels = 80 - self.num_fft_bins = 400 - self.stride = 100 - self.sampling_rate = 100 - self.max_audio_length = 5 - self.output_length = ( - self.max_audio_length * self.sampling_rate - ) // self.stride self.audio_feature_extractor = WhisperAudioFeatureExtractor( - num_mels=self.num_mels, - num_fft_bins=self.num_fft_bins, - stride=self.stride, - sampling_rate=self.sampling_rate, - max_audio_length=self.max_audio_length, - ) - - self.vocab = { - "Ġair": 0, - "plane": 1, - "Ġat": 2, - "port": 3, - "Ġkoh": 4, - "li": 5, - "Ġis": 6, - "Ġthe": 7, - "Ġbest": 8, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges - + num_mels=80, + num_fft_bins=400, + stride=100, + sampling_rate=100, + max_audio_length=5, + ) + self.vocab = ["air", "Ġair", "plane", "Ġat", "port"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] self.special_tokens = { "<|startoftranscript|>": 9, "<|endoftext|>": 10, @@ -66,117 +43,40 @@ def setUp(self): "<|transcribe|>": 12, "<|translate|>": 13, } - self.language_tokens = { "<|en|>": 14, "<|fr|>": 15, } - self.tokenizer = WhisperTokenizer( vocabulary=self.vocab, merges=self.merges, special_tokens=self.special_tokens, language_tokens=self.language_tokens, ) - - self.preprocessor = WhisperPreprocessor( - audio_feature_extractor=self.audio_feature_extractor, - tokenizer=self.tokenizer, - decoder_sequence_length=12, - language="<|en|>", - task="translate", - ) - - def test_unbatched_preprocess(self): - input_data = { - "encoder_audio": tf.ones((200,)), - "decoder_text": tf.constant(" airplane at airport"), + self.init_kwargs = { + "audio_feature_extractor": self.audio_feature_extractor, + "tokenizer": self.tokenizer, + "decoder_sequence_length": 12, + "language": "<|en|>", + "task": "translate", } - - x = self.preprocessor(input_data) - self.assertAllEqual( - x["encoder_features"].shape, [self.output_length, self.num_mels] - ) - self.assertAllEqual( - x["decoder_token_ids"], [9, 14, 13, 11, 0, 1, 2, 0, 3, 10, 10, 10] - ) - self.assertAllEqual( - x["decoder_padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0] - ) - - def test_preprocess_batch(self): - input_data = { - "encoder_audio": tf.ones((4, 200)), - "decoder_text": tf.constant([" airplane at airport"] * 4), + self.input_data = { + "encoder_audio": np.ones((2, 200)), + "decoder_text": [" airplane at airport", " airplane at"], } - x = self.preprocessor(input_data) - self.assertAllEqual( - x["encoder_features"].shape, [4, self.output_length, self.num_mels] - ) - self.assertAllEqual( - x["decoder_token_ids"], - [[9, 14, 13, 11, 0, 1, 2, 0, 3, 10, 10, 10]] * 4, - ) - self.assertAllEqual( - x["decoder_padding_mask"], - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 4, - ) - - def test_preprocess_labeled_batch(self): - x = { - "encoder_audio": tf.ones((4, 200)), - "decoder_text": tf.constant([" airplane at airport"] * 4), - } - y_in = tf.constant([1] * 4) - 
sw_in = tf.constant([1.0] * 4) - x, y, sw = self.preprocessor(x, y_in, sw_in) - self.assertAllEqual( - x["encoder_features"].shape, [4, self.output_length, self.num_mels] - ) - self.assertAllEqual( - x["decoder_token_ids"], - [[9, 14, 13, 11, 0, 1, 2, 0, 3, 10, 10, 10]] * 4, - ) - self.assertAllEqual( - x["decoder_padding_mask"], - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 4, - ) - self.assertAllEqual(y, y_in) - self.assertAllEqual(sw, sw_in) - - def test_preprocess_dataset(self): - x = { - "encoder_audio": tf.ones((4, 200)), - "decoder_text": tf.constant([" airplane at airport"] * 4), - } - ds = tf.data.Dataset.from_tensor_slices(x) - ds = ds.map(self.preprocessor) - x = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x["encoder_features"].shape, [4, self.output_length, self.num_mels] - ) - self.assertAllEqual( - x["decoder_token_ids"], - [[9, 14, 13, 11, 0, 1, 2, 0, 3, 10, 10, 10]] * 4, - ) - self.assertAllEqual( - x["decoder_padding_mask"], - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 4, + def test_feature_extractor_basics(self): + self.run_preprocessing_layer_test( + cls=WhisperPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, ) def test_sequence_length_override(self): input_data = { - "encoder_audio": tf.ones((200,)), - "decoder_text": tf.constant(" airplane at airport"), + "encoder_audio": np.ones((200,)), + "decoder_text": " airplane at airport", } - x = self.preprocessor(input_data, decoder_sequence_length=6) - self.assertAllEqual(x["decoder_token_ids"], [9, 14, 13, 11, 0, 10]) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + preprocessor = WhisperPreprocessor(**self.init_kwargs) + x = preprocessor(input_data, decoder_sequence_length=6) + self.assertAllEqual(x["decoder_token_ids"], [9, 14, 13, 11, 1, 10]) diff --git a/keras_nlp/models/whisper/whisper_presets.py b/keras_nlp/models/whisper/whisper_presets.py index e8c0d075a4..b881ee57b9 100644 --- a/keras_nlp/models/whisper/whisper_presets.py +++ b/keras_nlp/models/whisper/whisper_presets.py @@ -11,131 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
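> Everything deleted below (special-token maps, language-token tables, per-preset architecture configs, and the GCS weight/vocab/merges URLs with their hashes) now travels with the hosted model assets; each preset entry is reduced to its metadata plus a `kaggle_handle`. A sketch of loading after the change; network access is required, the handle string is copied from the diff, and passing a raw handle to `from_preset` is assumed to be supported:

```python
from keras_nlp.models.whisper.whisper_backbone import WhisperBackbone

# Normal route: the preset name resolves to its `kaggle_handle` entry.
backbone = WhisperBackbone.from_preset("whisper_tiny_en")

# Assumed-equivalent route, if `from_preset` accepts raw handles directly.
backbone = WhisperBackbone.from_preset(
    "kaggle://keras/whisper/keras/whisper_tiny_en/2"
)
```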
-MULTILINGUAL_SPECIAL_TOKENS = { - "<|startoftranscript|>": 50258, - "<|endoftext|>": 50257, - "<|notimestamps|>": 50363, - "<|translate|>": 50359, - "<|transcribe|>": 50358, -} - -ENGLISH_SPECIAL_TOKENS = { - "<|startoftranscript|>": 50257, - "<|endoftext|>": 50256, - "<|notimestamps|>": 50362, - "<|translate|>": 50358, - "<|transcribe|>": 50357, -} - -AUDIO_FEATURE_EXTRACTOR_CONFIG = { - "num_mels": 80, - "num_fft_bins": 400, - "stride": 160, - "sampling_rate": 16000, - "max_audio_length": 30, -} - -LANGUAGE_TOKENS = { - "<|af|>": 50327, - "<|am|>": 50334, - "<|ar|>": 50272, - "<|as|>": 50350, - "<|az|>": 50304, - "<|ba|>": 50355, - "<|be|>": 50330, - "<|bg|>": 50292, - "<|bn|>": 50302, - "<|bo|>": 50347, - "<|br|>": 50309, - "<|bs|>": 50315, - "<|ca|>": 50270, - "<|cs|>": 50283, - "<|cy|>": 50297, - "<|da|>": 50285, - "<|de|>": 50261, - "<|el|>": 50281, - "<|en|>": 50259, - "<|es|>": 50262, - "<|et|>": 50307, - "<|eu|>": 50310, - "<|fa|>": 50300, - "<|fi|>": 50277, - "<|fo|>": 50338, - "<|fr|>": 50265, - "<|gl|>": 50319, - "<|gu|>": 50333, - "<|haw|>": 50352, - "<|ha|>": 50354, - "<|he|>": 50279, - "<|hi|>": 50276, - "<|hr|>": 50291, - "<|ht|>": 50339, - "<|hu|>": 50286, - "<|hy|>": 50312, - "<|id|>": 50275, - "<|is|>": 50311, - "<|it|>": 50274, - "<|ja|>": 50266, - "<|jw|>": 50356, - "<|ka|>": 50329, - "<|kk|>": 50316, - "<|km|>": 50323, - "<|kn|>": 50306, - "<|ko|>": 50264, - "<|la|>": 50294, - "<|lb|>": 50345, - "<|ln|>": 50353, - "<|lo|>": 50336, - "<|lt|>": 50293, - "<|lv|>": 50301, - "<|mg|>": 50349, - "<|mi|>": 50295, - "<|mk|>": 50308, - "<|ml|>": 50296, - "<|mn|>": 50314, - "<|mr|>": 50320, - "<|ms|>": 50282, - "<|mt|>": 50343, - "<|my|>": 50346, - "<|ne|>": 50313, - "<|nl|>": 50271, - "<|nn|>": 50342, - "<|no|>": 50288, - "<|oc|>": 50328, - "<|pa|>": 50321, - "<|pl|>": 50269, - "<|ps|>": 50340, - "<|pt|>": 50267, - "<|ro|>": 50284, - "<|ru|>": 50263, - "<|sa|>": 50344, - "<|sd|>": 50332, - "<|si|>": 50322, - "<|sk|>": 50298, - "<|sl|>": 50305, - "<|sn|>": 50324, - "<|so|>": 50326, - "<|sq|>": 50317, - "<|sr|>": 50303, - "<|su|>": 50357, - "<|sv|>": 50273, - "<|sw|>": 50318, - "<|ta|>": 50287, - "<|te|>": 50299, - "<|tg|>": 50331, - "<|th|>": 50289, - "<|tk|>": 50341, - "<|tl|>": 50348, - "<|tr|>": 50268, - "<|tt|>": 50351, - "<|uk|>": 50280, - "<|ur|>": 50290, - "<|uz|>": 50337, - "<|vi|>": 50278, - "<|yi|>": 50335, - "<|yo|>": 50325, - "<|zh|>": 50260, -} # Metadata for loading pretrained model weights. 
backbone_presets = { @@ -150,28 +25,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51864, - "num_layers": 4, - "num_heads": 6, - "hidden_dim": 384, - "intermediate_dim": 1536, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": ENGLISH_SPECIAL_TOKENS, - "language_tokens": None, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_tiny_en/v1/model.h5", - "weights_hash": "3dc3768ac48ec90b1029fbf52ffbacc7", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_tiny_en/v1/vocab.json", - "vocabulary_hash": "22377f841debacb023848b3468ea3281", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_tiny_en/v1/merges.txt", - "merges_hash": "093ecf3f30371012f2e96fcfb10ea6ab", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_tiny_en/2", }, "whisper_base_en": { "metadata": { @@ -184,28 +38,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51864, - "num_layers": 6, - "num_heads": 8, - "hidden_dim": 512, - "intermediate_dim": 2048, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": ENGLISH_SPECIAL_TOKENS, - "language_tokens": None, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_en/v1/model.h5", - "weights_hash": "799d3c143993d42f7446bafbc0f46d7d", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_en/v1/vocab.json", - "vocabulary_hash": "22377f841debacb023848b3468ea3281", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_en/v1/merges.txt", - "merges_hash": "093ecf3f30371012f2e96fcfb10ea6ab", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_base_en/2", }, "whisper_small_en": { "metadata": { @@ -218,28 +51,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51864, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": ENGLISH_SPECIAL_TOKENS, - "language_tokens": None, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_en/v1/model.h5", - "weights_hash": "b75a89225e20019d85ff5f1c362f8a49", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_en/v1/vocab.json", - "vocabulary_hash": "22377f841debacb023848b3468ea3281", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_en/v1/merges.txt", - "merges_hash": "093ecf3f30371012f2e96fcfb10ea6ab", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_small_en/2", }, "whisper_medium_en": { "metadata": { @@ -252,28 +64,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51864, - "num_layers": 
24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": ENGLISH_SPECIAL_TOKENS, - "language_tokens": None, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_medium_en/v1/model.h5", - "weights_hash": "107184882d1cc65926815e4cc50dc5f3", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_medium_en/v1/vocab.json", - "vocabulary_hash": "22377f841debacb023848b3468ea3281", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_medium_en/v1/merges.txt", - "merges_hash": "093ecf3f30371012f2e96fcfb10ea6ab", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_medium_en/2", }, "whisper_tiny_multi": { "metadata": { @@ -286,28 +77,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51865, - "num_layers": 4, - "num_heads": 6, - "hidden_dim": 384, - "intermediate_dim": 1536, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": MULTILINGUAL_SPECIAL_TOKENS, - "language_tokens": LANGUAGE_TOKENS, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_tiny_multi/v1/model.h5", - "weights_hash": "b1279a81001ad5eb35970d1aea706396", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_tiny_multi/v1/vocab.json", - "vocabulary_hash": "1b87ed3e3ecd9ccfdca74e64cbe81d68", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_tiny_multi/v1/merges.txt", - "merges_hash": "c7f01d4100f6211417988889bf35ccd8", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_tiny_multi/2", }, "whisper_base_multi": { "metadata": { @@ -320,28 +90,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51865, - "num_layers": 6, - "num_heads": 8, - "hidden_dim": 512, - "intermediate_dim": 2048, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": MULTILINGUAL_SPECIAL_TOKENS, - "language_tokens": LANGUAGE_TOKENS, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_multi/v1/model.h5", - "weights_hash": "5208396e2d5efac43114a4a3d4f583ab", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_multi/v1/vocab.json", - "vocabulary_hash": "1b87ed3e3ecd9ccfdca74e64cbe81d68", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_multi/v1/merges.txt", - "merges_hash": "c7f01d4100f6211417988889bf35ccd8", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_base_multi/2", }, "whisper_small_multi": { "metadata": { @@ -354,28 +103,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51865, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "num_mels": 80, - "dropout": 0.0, - 
"max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": MULTILINGUAL_SPECIAL_TOKENS, - "language_tokens": LANGUAGE_TOKENS, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_multi/v1/model.h5", - "weights_hash": "c90c6a895e522056b77b924b6e907ed8", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_multi/v1/vocab.json", - "vocabulary_hash": "1b87ed3e3ecd9ccfdca74e64cbe81d68", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_base_multi/v1/merges.txt", - "merges_hash": "c7f01d4100f6211417988889bf35ccd8", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_small_multi/2", }, "whisper_medium_multi": { "metadata": { @@ -388,28 +116,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51865, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": MULTILINGUAL_SPECIAL_TOKENS, - "language_tokens": LANGUAGE_TOKENS, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_medium_multi/v1/model.h5", - "weights_hash": "6f993f732fe397e9c5e3a96a9505a3a9", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_medium_multi/v1/vocab.json", - "vocabulary_hash": "1b87ed3e3ecd9ccfdca74e64cbe81d68", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_medium_multi/v1/merges.txt", - "merges_hash": "c7f01d4100f6211417988889bf35ccd8", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_medium_multi/2", }, "whisper_large_multi": { "metadata": { @@ -422,28 +129,7 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51865, - "num_layers": 32, - "num_heads": 20, - "hidden_dim": 1280, - "intermediate_dim": 5120, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - "audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": MULTILINGUAL_SPECIAL_TOKENS, - "language_tokens": LANGUAGE_TOKENS, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_large_multi/v1/model.h5", - "weights_hash": "ccab1c93c5739007868ae73fe025806d", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_large_multi/v1/vocab.json", - "vocabulary_hash": "1b87ed3e3ecd9ccfdca74e64cbe81d68", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_large_multi/v1/merges.txt", - "merges_hash": "c7f01d4100f6211417988889bf35ccd8", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_large_multi/2", }, "whisper_large_multi_v2": { "metadata": { @@ -457,27 +143,6 @@ "path": "whisper", "model_card": "https://github.com/openai/whisper/blob/main/model-card.md", }, - "config": { - "vocabulary_size": 51865, - "num_layers": 32, - "num_heads": 20, - "hidden_dim": 1280, - "intermediate_dim": 5120, - "num_mels": 80, - "dropout": 0.0, - "max_encoder_sequence_length": 3000, - "max_decoder_sequence_length": 448, - }, - 
"audio_feature_extractor_config": AUDIO_FEATURE_EXTRACTOR_CONFIG, - "preprocessor_config": { - "special_tokens": MULTILINGUAL_SPECIAL_TOKENS, - "language_tokens": LANGUAGE_TOKENS, - }, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/whisper_large_multi_v2/v1/model.h5", - "weights_hash": "ca157162ec9c3329a659388528a3af88", - "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/whisper_large_multi_v2/v1/vocab.json", - "vocabulary_hash": "1b87ed3e3ecd9ccfdca74e64cbe81d68", - "merges_url": "https://storage.googleapis.com/keras-nlp/models/whisper_large_multi_v2/v1/merges.txt", - "merges_hash": "c7f01d4100f6211417988889bf35ccd8", + "kaggle_handle": "kaggle://keras/whisper/keras/whisper_large_multi_v2/2", }, } diff --git a/keras_nlp/models/whisper/whisper_presets_test.py b/keras_nlp/models/whisper/whisper_presets_test.py deleted file mode 100644 index 09529c66f3..0000000000 --- a/keras_nlp/models/whisper/whisper_presets_test.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -import tensorflow as tf -from absl.testing import parameterized - -from keras_nlp.models.whisper.whisper_audio_feature_extractor import ( - WhisperAudioFeatureExtractor, -) -from keras_nlp.models.whisper.whisper_backbone import WhisperBackbone -from keras_nlp.models.whisper.whisper_tokenizer import WhisperTokenizer - - -@pytest.mark.tf_only -@pytest.mark.large -class WhisperPresetSmokeTest(tf.test.TestCase, parameterized.TestCase): - """ - A smoke test for Whisper presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/whisper/whisper_presets_test.py --run_large` - """ - - def test_audio_feature_extractor_output(self): - audio_feature_extractor = WhisperAudioFeatureExtractor.from_preset( - "whisper_tiny_en" - ) - # Don't really need to check for output here. - audio_feature_extractor(tf.ones((200,))) - - def test_tokenizer_output(self): - tokenizer = WhisperTokenizer.from_preset("whisper_tiny_en") - outputs = tokenizer("The quick brown fox.") - expected_outputs = [464, 2068, 7586, 21831, 13] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - @pytest.mark.skip # TODO: fix weight mismatch error. - def test_backbone_output(self, load_weights): - input_data = { - "encoder_features": tf.ones((1, 3000, 80)), - "decoder_token_ids": tf.constant( - [[50257, 50362, 464, 2068, 7586, 21831, 13, 50256, 50256]] - ), - "decoder_padding_mask": tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0]]), - } - model = WhisperBackbone.from_preset( - "whisper_tiny_en", load_weights=load_weights - ) - outputs = model(input_data)["decoder_sequence_output"][0, 0, :5] - if load_weights: - # The forward pass from a preset should be stable! - # This test should catch cases where we unintentionally change our - # network code in a way that would invalidate our preset weights. 
- # We should only update these numbers if we are updating a weights - # file, or have found a discrepancy with the upstream source. - expected_outputs = [13.238, 1.051, 8.348, -20.012, -5.022] - # Keep a high tolerance, so we are robust to different hardware. - self.assertAllClose(outputs, expected_outputs, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("whisper_tokenizer", WhisperTokenizer), - ("whisper", WhisperBackbone), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("whisper_tokenizer", WhisperTokenizer), - ("whisper", WhisperBackbone), - ) - def test_unknown_preset_error(self, cls): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("whisper_tiny_en_clowntown") - - -@pytest.mark.extra_large -class WhisperPresetFullTest(tf.test.TestCase, parameterized.TestCase): - """ - Test the full enumeration of our preset. - - This tests every Whisper preset and is only run manually. - Run with: - `pytest keras_nlp/models/whisper/whisper_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_whisper(self, load_weights): - for preset in WhisperBackbone.presets: - model = WhisperBackbone.from_preset( - preset, load_weights=load_weights - ) - input_data = { - "encoder_features": tf.ones((1, 3000, 80)), - "decoder_token_ids": tf.random.uniform( - shape=(1, 446), - dtype="int64", - maxval=model.vocabulary_size, - ), - "decoder_padding_mask": tf.constant([1] * 446, shape=(1, 446)), - } - model(input_data) - - def test_load_tokenizers(self): - for preset in WhisperTokenizer.presets: - tokenizer = WhisperTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_audio_feature_extractors(self): - for preset in WhisperAudioFeatureExtractor.presets: - audio_feature_extractor = WhisperAudioFeatureExtractor.from_preset( - preset - ) - audio_feature_extractor(tf.ones((200,))) diff --git a/keras_nlp/models/whisper/whisper_tokenizer.py b/keras_nlp/models/whisper/whisper_tokenizer.py index b1406b0a04..4446193738 100644 --- a/keras_nlp/models/whisper/whisper_tokenizer.py +++ b/keras_nlp/models/whisper/whisper_tokenizer.py @@ -15,7 +15,7 @@ import copy import json -from keras_nlp.api_export import keras_nlp_export +from keras_nlp.backend import keras from keras_nlp.models.whisper.whisper_presets import backbone_presets from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer from keras_nlp.utils.python_utils import classproperty @@ -23,12 +23,12 @@ def _load_dict(dict_or_path): if isinstance(dict_or_path, str): - with open(dict_or_path, "r") as f: + with open(dict_or_path, "r", encoding="utf-8") as f: dict_or_path = json.load(f) return dict_or_path -@keras_nlp_export("keras_nlp.models.WhisperTokenizer") +@keras.saving.register_keras_serializable(package="keras_nlp") class WhisperTokenizer(BytePairTokenizer): """Whisper text tokenizer using Byte-Pair Encoding subword segmentation. @@ -52,49 +52,36 @@ class WhisperTokenizer(BytePairTokenizer): def __init__( self, - vocabulary, - merges, - special_tokens, + vocabulary=None, + merges=None, + special_tokens=None, language_tokens=None, **kwargs, ): - vocabulary = _load_dict(vocabulary) - - # Necessary special tokens. 
- bos_token = "<|startoftranscript|>" - eos_token = "<|endoftext|>" - + special_tokens = _load_dict(special_tokens) if language_tokens is not None: - # Multilingual tokenizer. - # TODO: The pad token for the multilingual tokenizer is actually - # "", but it errors out (OOM). After BPE is fixed, we can update - # this to "". For now, we will use `"<|endoftext|>"`. - pad_token = "<|endoftext|>" language_tokens = _load_dict(language_tokens) - # Add language tokens to the vocabulary. This makes detokenization - # easier for us. - vocabulary = { - **vocabulary, - **language_tokens, - } - else: - # English tokenizer. - pad_token = "<|endoftext|>" - - no_timestamps_token = "<|notimestamps|>" + # Necessary special tokens. + self.bos_token = "<|startoftranscript|>" + self.eos_token = "<|endoftext|>" + # TODO: The pad token for the multilingual tokenizer is actually + # "", but it errors out (OOM). After BPE is fixed, we can update + # this to "". For now, we will use `"<|endoftext|>"`. + self.pad_token = "<|endoftext|>" + + self.no_timestamps_token = "<|notimestamps|>" # Task special tokens. - translate_token = "<|translate|>" - transcribe_token = "<|transcribe|>" + self.translate_token = "<|translate|>" + self.transcribe_token = "<|transcribe|>" - special_tokens = _load_dict(special_tokens) for token in [ - bos_token, - eos_token, - pad_token, - no_timestamps_token, - translate_token, - transcribe_token, + self.bos_token, + self.eos_token, + self.pad_token, + self.no_timestamps_token, + self.translate_token, + self.transcribe_token, ]: if token not in special_tokens: raise ValueError( @@ -102,15 +89,16 @@ def __init__( f"`special_tokens`. Please provide `'{token}'` in your " "`special_tokens`." ) - # Add special tokens to `vocabulary` for easy detokenization. - vocabulary[token] = special_tokens[token] - self.bos_token_id = special_tokens[bos_token] - self.eos_token_id = special_tokens[eos_token] - self.pad_token_id = special_tokens[pad_token] - self.no_timestamps_token_id = special_tokens[no_timestamps_token] - self.translate_token_id = special_tokens[translate_token] - self.transcribe_token_id = special_tokens[transcribe_token] + self.bos_token_id = special_tokens[self.bos_token] + self.eos_token_id = special_tokens[self.eos_token] + self.pad_token_id = special_tokens[self.pad_token] + self.no_timestamps_token_id = special_tokens[self.no_timestamps_token] + self.translate_token_id = special_tokens[self.translate_token] + self.transcribe_token_id = special_tokens[self.transcribe_token] + + self.special_tokens = special_tokens + self.language_tokens = language_tokens # TODO: Add language tokens to `unsplittable_tokens` once we figure # out the performance issue with a large list. @@ -123,8 +111,43 @@ def __init__( **kwargs, ) - self.special_tokens = special_tokens - self.language_tokens = language_tokens + def save_assets(self, dir_path): + # TODO: whisper is currently mutating it's vocabulary before passing + # it to the super class, so we need to restore the unmutated vocabulary + # before saving our assets. We should find a more robust (and memory + # efficient) way to do this. + vocabulary = self.vocabulary + self.vocabulary = self._initial_vocabulary + super().save_assets(dir_path) + self.vocabulary = vocabulary + + def set_vocabulary_and_merges(self, vocabulary, merges): + if vocabulary is not None: + vocabulary = _load_dict(vocabulary) + self._initial_vocabulary = dict(vocabulary) + + if self.language_tokens is not None: + # Multilingual tokenizer. + # Add language tokens to the vocabulary. 
This makes + # detokenization easier for us. + vocabulary = { + **vocabulary, + **self.language_tokens, + } + + for token in [ + self.bos_token, + self.eos_token, + self.pad_token, + self.no_timestamps_token, + self.translate_token, + self.transcribe_token, + ]: + vocabulary[token] = self.special_tokens[token] + else: + self._initial_vocabulary = None + + super().set_vocabulary_and_merges(vocabulary, merges) def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/whisper/whisper_tokenizer_test.py b/keras_nlp/models/whisper/whisper_tokenizer_test.py index 5ebcb12e55..16fab2e34a 100644 --- a/keras_nlp/models/whisper/whisper_tokenizer_test.py +++ b/keras_nlp/models/whisper/whisper_tokenizer_test.py @@ -18,27 +18,14 @@ from keras_nlp.tests.test_case import TestCase -@pytest.mark.tf_only class WhisperTokenizerTest(TestCase): def setUp(self): - self.vocab = { - "Ġair": 0, - "plane": 1, - "Ġat": 2, - "port": 3, - "Ġkoh": 4, - "li": 5, - "Ġis": 6, - "Ġthe": 7, - "Ġbest": 8, - } - - merges = ["Ġ a", "Ġ t", "Ġ k", "Ġ i", "Ġ b", "Ġa i", "p l", "n e"] - merges += ["Ġa t", "p o", "r t", "o h", "l i", "Ġi s", "Ġb e", "s t"] - merges += ["Ġt h", "Ġai r", "pl a", "Ġk oh", "Ġth e", "Ġbe st", "po rt"] - merges += ["pla ne"] - self.merges = merges - + self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] + self.vocab += ["<|endoftext|>"] + self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) + self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] + self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] + self.merges += ["Ġai r", "Ġa i", "pla ne"] self.special_tokens = { "<|startoftranscript|>": 9, "<|endoftext|>": 10, @@ -46,56 +33,58 @@ def setUp(self): "<|transcribe|>": 12, "<|translate|>": 13, } - self.language_tokens = { "<|en|>": 14, "<|fr|>": 15, } + self.init_kwargs = { + "vocabulary": self.vocab, + "merges": self.merges, + "special_tokens": self.special_tokens, + "language_tokens": self.language_tokens, + } + self.input_data = [ + " airplane at airport<|endoftext|>", + " airplane airport", + ] - self.tokenizer = WhisperTokenizer( - vocabulary=self.vocab, - merges=self.merges, - special_tokens=self.special_tokens, - language_tokens=self.language_tokens, - ) - - def test_tokenize(self): - input_data = " airplane at airport" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [0, 1, 2, 0, 3]) - - def test_tokenize_batch(self): - input_data = [" airplane at airport", " kohli is the best"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[0, 1, 2, 0, 3], [4, 5, 6, 7, 8]]) - - def test_detokenize(self): - input_tokens = [0, 1, 2, 0, 3] - output = self.tokenizer.detokenize(input_tokens) - self.assertEqual(output, " airplane at airport") - - def test_detokenize_with_special_tokens(self): - input_tokens = [9, 14, 12, 11, 0, 1, 2, 0, 3, 10] - output = self.tokenizer.detokenize(input_tokens) - print(output) - self.assertEqual( - output, - "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> airplane at airport<|endoftext|>", + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=WhisperTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[2, 3, 4, 2, 5, 10], [2, 3, 2, 5]], ) - def test_vocabulary_size(self): - self.assertEqual(self.tokenizer.vocabulary_size(), 16) - def test_special_tokens(self): - self.assertEqual(self.tokenizer.bos_token_id, 9) - self.assertEqual(self.tokenizer.eos_token_id, 10) - 
self.assertEqual(self.tokenizer.pad_token_id, 10) - self.assertEqual(self.tokenizer.no_timestamps_token_id, 11) - self.assertEqual(self.tokenizer.translate_token_id, 13) - self.assertEqual(self.tokenizer.transcribe_token_id, 12) + tokenizer = WhisperTokenizer(**self.init_kwargs) + self.assertEqual(tokenizer.bos_token_id, 9) + self.assertEqual(tokenizer.eos_token_id, 10) + self.assertEqual(tokenizer.pad_token_id, 10) + self.assertEqual(tokenizer.no_timestamps_token_id, 11) + self.assertEqual(tokenizer.translate_token_id, 13) + self.assertEqual(tokenizer.transcribe_token_id, 12) def test_errors_missing_special_tokens(self): with self.assertRaises(ValueError): WhisperTokenizer( vocabulary=["a", "b", "c"], merges=[], special_tokens={} ) + + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=WhisperTokenizer, + preset="whisper_tiny_en", + input_data=["The quick brown fox."], + expected_output=[[464, 2068, 7586, 21831, 13]], + ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in WhisperTokenizer.presets: + self.run_preset_test( + cls=WhisperTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone_test.py index 26559f990d..e92aaea0ef 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone_test.py @@ -12,13 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone from keras_nlp.tests.test_case import TestCase @@ -26,91 +21,58 @@ class XLMRobertaBackboneTest(TestCase): def setUp(self): - self.backbone = XLMRobertaBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - max_sequence_length=5, - ) - self.input_batch = { - "token_ids": np.ones((2, 5), dtype="int32"), - "padding_mask": np.ones((2, 5), dtype="int32"), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + "max_sequence_length": 5, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_valid_call_xlm_roberta(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 5, 2)) - - def test_name(self): - # Check default name passed through - self.assertRegexpMatches(self.backbone.name, "xlm_roberta_backbone") - - def test_variable_sequence_length_call_xlm_roberta(self): - for seq_length in (2, 3, 4): - input_data = { - "token_ids": np.ones((2, seq_length), dtype="int32"), - "padding_mask": np.ones((2, seq_length), dtype="int32"), - } - output = self.backbone(input_data) - self.assertAllEqual( - ops.shape(output), - (2, seq_length, self.backbone.hidden_dim), - ) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - def test_serialization(self): - new_backbone = 
keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=XLMRobertaBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) - @pytest.mark.large # Saving is slow, so mark these large. + @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, XLMRobertaBackbone) - - # Check that output matches. - restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) + self.run_model_saving_test( + cls=XLMRobertaBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=XLMRobertaBackbone, + preset="xlm_roberta_base_multi", + input_data={ + "token_ids": ops.array([[0, 581, 63773, 2]], dtype="int32"), + "segment_ids": ops.zeros((1, 4), dtype="int32"), + "padding_mask": ops.ones((1, 4), dtype="int32"), + }, + expected_output_shape=(1, 4, 768), + # The forward pass from a preset should be stable! + expected_partial_output=ops.array( + [0.084763, 0.097018, 0.051329, -0.000805, 0.028415], + ), + ) -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class XLMRobertaBackboneTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = XLMRobertaBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=64, - intermediate_dim=128, - max_sequence_length=128, + @pytest.mark.extra_large + def test_all_presets(self): + for preset in XLMRobertaBackbone.presets: + self.run_preset_test( + cls=XLMRobertaBackbone, + preset=preset, + input_data=self.input_data, ) - self.input_batch = { - "token_ids": np.ones((8, 128), dtype="int32"), - "padding_mask": np.ones((8, 128), dtype="int32"), - } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py index f96e40b5b4..bfa6500247 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py @@ -12,16 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
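For context on the refactor above: the bespoke backbone assertions are replaced by shared `TestCase` helpers, and `run_preset_test` pins the first few output values of the smallest preset. A rough standalone equivalent of that stability check is sketched below (an illustration only, not part of the change; it assumes network access to download the preset, and reuses the inputs and expected values pinned in the diff above):

```python
import numpy as np

from keras_nlp.backend import ops
from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone

# Load the architecture config and pretrained weights behind the preset.
backbone = XLMRobertaBackbone.from_preset("xlm_roberta_base_multi")
outputs = backbone(
    {
        "token_ids": ops.array([[0, 581, 63773, 2]], dtype="int32"),
        "segment_ids": ops.zeros((1, 4), dtype="int32"),
        "padding_mask": ops.ones((1, 4), dtype="int32"),
    }
)
# The forward pass from a preset should be stable; keep a loose tolerance
# so the check is robust to different hardware.
np.testing.assert_allclose(
    ops.convert_to_numpy(outputs[0, 0, :5]),
    [0.084763, 0.097018, 0.051329, -0.000805, 0.028415],
    atol=0.01,
    rtol=0.01,
)
```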
-import io import os -import numpy as np import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras -from keras_nlp.backend import ops from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone from keras_nlp.models.xlm_roberta.xlm_roberta_classifier import ( XLMRobertaClassifier, @@ -37,108 +31,58 @@ class XLMRobertaClassifierTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - ) + # Setup model. self.preprocessor = XLMRobertaPreprocessor( - tokenizer=XLMRobertaTokenizer(proto=bytes_io.getvalue()), + XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + ), sequence_length=5, ) self.backbone = XLMRobertaBackbone( - vocabulary_size=10, + vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, + max_sequence_length=self.preprocessor.sequence_length, ) - self.classifier = XLMRobertaClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - # Check we handle serialization correctly. - activation=keras.activations.softmax, - hidden_dim=4, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + "num_classes": 2, + } + self.train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. + [1, 0], # Labels. ) + self.input_data = self.preprocessor(*self.train_data)[0] - self.raw_batch = [ - "the quick brown fox.", - "the slow brown fox.", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch) - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - (self.raw_batch, np.ones((2,))) - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_classifier(self): - self.classifier(self.preprocessed_batch) - - def test_classifier_predict(self): - preds1 = self.classifier.predict(self.raw_batch) - self.classifier.preprocessor = None - preds2 = self.classifier.predict(self.preprocessed_batch) - # Assert predictions match. - self.assertAllClose(preds1, preds2) - # Assert valid softmax output. - self.assertAllClose(ops.sum(preds2, axis=-1), [1.0, 1.0]) - - def test_classifier_fit(self): - self.classifier.fit(self.raw_dataset) - self.classifier.preprocessor = None - self.classifier.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.classifier.preprocessor = None - self.classifier.compile( - loss="sparse_categorical_crossentropy", - jit_compile=False, + def test_classifier_basics(self): + self.run_task_test( + cls=XLMRobertaClassifier, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 2), ) - self.classifier.fit(self.preprocessed_dataset) - def test_serialization(self): - # Defaults. - original = XLMRobertaClassifier( - self.backbone, - num_classes=2, - ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - # With options. 
- original = XLMRobertaClassifier( - self.backbone, - num_classes=4, - preprocessor=self.preprocessor, - activation=keras.activations.softmax, - hidden_dim=4, - name="test", - trainable=False, + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=XLMRobertaClassifier, + init_kwargs=self.init_kwargs, + input_data=self.input_data, ) - config = keras.saving.serialize_keras_object(original) - restored = keras.saving.deserialize_keras_object(config) - self.assertEqual(restored.get_config(), original.get_config()) - - @pytest.mark.large # Saving is slow, so mark these large. - def test_saving_model(self): - model_output = self.classifier.predict(self.raw_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.classifier.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, XLMRobertaClassifier) - # Check that output matches. - restored_output = restored_model.predict(self.raw_batch) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in XLMRobertaClassifier.presets: + self.run_preset_test( + cls=XLMRobertaClassifier, + preset=preset, + init_kwargs={"num_classes": 2}, + input_data=self.input_data, + expected_output_shape=(2, 2), + ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py index 2a8750c583..a26905e9e3 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py @@ -137,18 +137,27 @@ def __init__( truncate=truncate, **kwargs, ) - + self.mask_selection_rate = mask_selection_rate + self.mask_selection_length = mask_selection_length + self.mask_token_rate = mask_token_rate + self.random_token_rate = random_token_rate + self.masker = None + + def build(self, input_shape): + super().build(input_shape) + # Defer masker creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
self.masker = MaskedLMMaskGenerator( - mask_selection_rate=mask_selection_rate, - mask_selection_length=mask_selection_length, - mask_token_rate=mask_token_rate, - random_token_rate=random_token_rate, - vocabulary_size=tokenizer.vocabulary_size(), - mask_token_id=tokenizer.mask_token_id, + mask_selection_rate=self.mask_selection_rate, + mask_selection_length=self.mask_selection_length, + mask_token_rate=self.mask_token_rate, + random_token_rate=self.random_token_rate, + vocabulary_size=self.tokenizer.vocabulary_size(), + mask_token_id=self.tokenizer.mask_token_id, unselectable_token_ids=[ - tokenizer.start_token_id, - tokenizer.end_token_id, - tokenizer.pad_token_id, + self.tokenizer.start_token_id, + self.tokenizer.end_token_id, + self.tokenizer.pad_token_id, ], ) @@ -156,10 +165,10 @@ def get_config(self): config = super().get_config() config.update( { - "mask_selection_rate": self.masker.mask_selection_rate, - "mask_selection_length": self.masker.mask_selection_length, - "mask_token_rate": self.masker.mask_token_rate, - "random_token_rate": self.masker.random_token_rate, + "mask_selection_rate": self.mask_selection_rate, + "mask_selection_length": self.mask_selection_length, + "mask_token_rate": self.mask_token_rate, + "random_token_rate": self.random_token_rate, } ) return config diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py index cbb74c7722..c1bfc7242a 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import ( XLMRobertaMaskedLMPreprocessor, ) @@ -29,120 +27,65 @@ class XLMRobertaMaskedLMPreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="<pad>", - unk_piece="<unk>", - bos_piece="<s>", - eos_piece="</s>", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - - self.tokenizer = XLMRobertaTokenizer(proto=self.proto) - self.preprocessor = XLMRobertaMaskedLMPreprocessor( - tokenizer=self.tokenizer, + self.init_kwargs = { + "tokenizer": self.tokenizer, # Simplify our testing by masking every available token.
- mask_selection_rate=1.0, - mask_token_rate=1.0, - random_token_rate=0.0, - mask_selection_length=5, - sequence_length=12, - ) - - def test_preprocess_strings(self): - input_data = " brown fox quick" - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [0, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1, 1] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [1, 2, 3, 0, 0]) - self.assertAllEqual(y, [7, 9, 11, 0, 0]) - self.assertAllEqual(sw, [1.0, 1.0, 1.0, 0.0, 0.0]) - - def test_preprocess_list_of_strings(self): - input_data = [" brown fox quick"] * 13 - - x, y, sw = self.preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [[0, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1, 1]] * 13 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]] * 13 + "mask_selection_rate": 1.0, + "mask_token_rate": 1.0, + "random_token_rate": 0.0, + "mask_selection_length": 4, + "sequence_length": 12, + } + self.input_data = ["the quick brown fox"] + + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=XLMRobertaMaskedLMPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[0, 13, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[1, 2, 3, 4]], + }, + [[6, 11, 7, 9]], + [[1.0, 1.0, 1.0, 1.0]], + ), ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 0, 0]] * 13) - self.assertAllEqual(y, [[7, 9, 11, 0, 0]] * 13) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 0.0, 0.0]] * 13) - - def test_preprocess_dataset(self): - sentences = tf.constant([" brown fox quick"] * 13) - ds = tf.data.Dataset.from_tensor_slices(sentences) - ds = ds.map(self.preprocessor) - x, y, sw = ds.batch(13).take(1).get_single_element() - self.assertAllEqual( - x["token_ids"], [[0, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1, 1]] * 13 - ) - self.assertAllEqual( - x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]] * 13 - ) - self.assertAllEqual(x["mask_positions"], [[1, 2, 3, 0, 0]] * 13) - self.assertAllEqual(y, [[7, 9, 11, 0, 0]] * 13) - self.assertAllEqual(sw, [[1.0, 1.0, 1.0, 0.0, 0.0]] * 13) - - def test_mask_multiple_sentences(self): - sentence_one = tf.constant(" airplane") - sentence_two = tf.constant(" round") - - x, y, sw = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - x["token_ids"], [0, 2, 2, 2, 13, 2, 1, 1, 1, 1, 1, 1] - ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [4, 0, 0, 0, 0]) - self.assertAllEqual(y, [12, 0, 0, 0, 0]) - self.assertAllEqual(sw, [1.0, 0.0, 0.0, 0.0, 0.0]) def test_no_masking_zero_rate(self): no_mask_preprocessor = XLMRobertaMaskedLMPreprocessor( - self.preprocessor.tokenizer, + self.tokenizer, mask_selection_rate=0.0, - mask_selection_length=5, + mask_selection_length=4, sequence_length=12, ) - input_data = " quick brown fox" - - x, y, sw = no_mask_preprocessor(input_data) - self.assertAllEqual( - x["token_ids"], [0, 11, 7, 9, 2, 1, 1, 1, 1, 1, 1, 1] + input_data = ["the quick brown fox"] + self.assertAllClose( + no_mask_preprocessor(input_data), + ( + { + "token_ids": [[0, 6, 11, 7, 9, 2, 1, 1, 1, 1, 1, 1]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], + "mask_positions": [[0, 0, 0, 0]], + }, + [[0, 0, 0, 0]], + [[0.0, 0.0, 0.0, 0.0]], + ), ) - self.assertAllEqual( - x["padding_mask"], [1, 1, 1, 1, 1, 0, 0, 0, 0, 
0, 0, 0] - ) - self.assertAllEqual(x["mask_positions"], [0, 0, 0, 0, 0]) - self.assertAllEqual(y, [0, 0, 0, 0, 0]) - self.assertAllEqual(sw, [0.0, 0.0, 0.0, 0.0, 0.0]) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in XLMRobertaMaskedLMPreprocessor.presets: + self.run_preset_test( + cls=XLMRobertaMaskedLMPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py index 94c06ea7be..d9a1ce68f1 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KerasNLP Authors +# Copyright 2023 The KerasNLP Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os import pytest -import sentencepiece -import tensorflow as tf -from keras_nlp.backend import keras from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm import ( XLMRobertaMaskedLM, ) @@ -35,29 +31,14 @@ class XLMRobertaMaskedLMTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the slow brown fox"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=5, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="<pad>", - unk_piece="<unk>", - bos_piece="<s>", - eos_piece="</s>", - user_defined_symbols="[MASK]", - ) - self.proto = bytes_io.getvalue() - + # Setup model. self.preprocessor = XLMRobertaMaskedLMPreprocessor( - XLMRobertaTokenizer(proto=self.proto), + XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + ), # Simplify our testing by masking every available token. mask_selection_rate=1.0, mask_token_rate=1.0, @@ -65,70 +46,44 @@ def setUp(self): mask_selection_length=5, sequence_length=5, ) - self.backbone = XLMRobertaBackbone( vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(), num_layers=2, num_heads=2, hidden_dim=2, intermediate_dim=4, - max_sequence_length=self.preprocessor.packer.sequence_length, + max_sequence_length=self.preprocessor.sequence_length, ) - - self.masked_lm = XLMRobertaMaskedLM( - self.backbone, - preprocessor=self.preprocessor, + self.init_kwargs = { + "preprocessor": self.preprocessor, + "backbone": self.backbone, + } + self.train_data = ( + ["the quick brown fox.", "the earth is round"], # Features.
) - - self.raw_batch = [ - "the quick brown fox", - "the slow brown fox", - ] - self.preprocessed_batch = self.preprocessor(self.raw_batch)[0] - self.raw_dataset = tf.data.Dataset.from_tensor_slices( - self.raw_batch - ).batch(2) - self.preprocessed_dataset = self.raw_dataset.map(self.preprocessor) - - def test_valid_call_masked_lm(self): - self.masked_lm(self.preprocessed_batch) - - def test_classifier_predict(self): - self.masked_lm.predict(self.raw_batch) - self.masked_lm.preprocessor = None - self.masked_lm.predict(self.preprocessed_batch) - - def test_classifier_fit(self): - self.masked_lm.fit(self.raw_dataset) - self.masked_lm.preprocessor = None - self.masked_lm.fit(self.preprocessed_dataset) - - def test_classifier_fit_no_xla(self): - self.masked_lm.preprocessor = None - self.masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False), - jit_compile=False, - ) - self.masked_lm.fit(self.preprocessed_dataset) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.masked_lm) - new_classifier = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_classifier.get_config(), - self.masked_lm.get_config(), + self.input_data = self.preprocessor(*self.train_data)[0] + + def test_masked_lm_basics(self): + self.run_task_test( + cls=XLMRobertaMaskedLM, + init_kwargs=self.init_kwargs, + train_data=self.train_data, + expected_output_shape=(2, 5, 14), ) @pytest.mark.large def test_saved_model(self): - save_path = os.path.join(self.get_temp_dir(), "model.keras") - self.masked_lm.save(save_path, save_format="keras_v3") - restored_model = keras.models.load_model(save_path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, XLMRobertaMaskedLM) - - model_output = self.masked_lm(self.preprocessed_batch) - restored_output = restored_model(self.preprocessed_batch) + self.run_model_saving_test( + cls=XLMRobertaMaskedLM, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) - self.assertAllClose(model_output, restored_output) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in XLMRobertaMaskedLM.presets: + self.run_preset_test( + cls=XLMRobertaMaskedLM, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py index e557bd0635..23b48073f7 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py @@ -156,22 +156,29 @@ def __init__( super().__init__(**kwargs) self.tokenizer = tokenizer + self.truncate = truncate + self.sequence_length = sequence_length + self.packer = None + def build(self, input_shape): + # Defer packer creation to `build()` so that we can be sure tokenizer + # assets have loaded when restoring a saved model. 
self.packer = MultiSegmentPacker( start_value=self.tokenizer.start_token_id, end_value=self.tokenizer.end_token_id, sep_value=[self.tokenizer.end_token_id] * 2, pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, + truncate=self.truncate, + sequence_length=self.sequence_length, ) + self.built = True def get_config(self): config = super().get_config() config.update( { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, + "sequence_length": self.sequence_length, + "truncate": self.truncate, } ) return config diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py index 711b52e264..3c3bbf2612 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( XLMRobertaPreprocessor, ) @@ -29,110 +27,48 @@ class XLMRobertaPreprocessorTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.tokenizer = XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + ) + self.init_kwargs = { + "tokenizer": self.tokenizer, + "sequence_length": 8, + } + self.input_data = ( + ["the quick brown fox"], + [1], # Pass through labels. + [1.0], # Pass through sample_weights. ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - ) - self.proto = bytes_io.getvalue() - - self.preprocessor = XLMRobertaPreprocessor( - tokenizer=XLMRobertaTokenizer(proto=self.proto), - sequence_length=12, - ) - - def test_tokenize_strings(self): - input_data = "the quick brown fox" - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [0, 4, 9, 5, 7, 2, 1, 1, 1, 1, 1, 1] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] - ) - - def test_tokenize_list_of_strings(self): - # We should handle a list of strings as a batch.
- input_data = ["the quick brown fox"] * 4 - output = self.preprocessor(input_data) - self.assertAllEqual( - output["token_ids"], [[0, 4, 9, 5, 7, 2, 1, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - - def test_tokenize_labeled_batch(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - x_out, y_out, sw_out = self.preprocessor(x, y, sw) - self.assertAllEqual( - x_out["token_ids"], [[0, 4, 9, 5, 7, 2, 1, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - - def test_tokenize_labeled_dataset(self): - x = tf.constant(["the quick brown fox"] * 4) - y = tf.constant([1] * 4) - sw = tf.constant([1.0] * 4) - ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) - ds = ds.map(self.preprocessor) - x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() - self.assertAllEqual( - x_out["token_ids"], [[0, 4, 9, 5, 7, 2, 1, 1, 1, 1, 1, 1]] * 4 - ) - self.assertAllEqual( - x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 - ) - self.assertAllEqual(y_out, y) - self.assertAllEqual(sw_out, sw) - def test_tokenize_multiple_sentences(self): - sentence_one = tf.constant("the quick brown fox") - sentence_two = tf.constant("the earth") - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], [0, 4, 9, 5, 7, 2, 2, 4, 6, 2, 1, 1] - ) - self.assertAllEqual( - output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0] - ) - - def test_tokenize_multiple_batched_sentences(self): - sentence_one = tf.constant(["the quick brown fox"] * 4) - sentence_two = tf.constant(["the earth"] * 4) - # The first tuple or list is always interpreted as an enumeration of - # separate sequences to concatenate. - output = self.preprocessor((sentence_one, sentence_two)) - self.assertAllEqual( - output["token_ids"], [[0, 4, 9, 5, 7, 2, 2, 4, 6, 2, 1, 1]] * 4 - ) - self.assertAllEqual( - output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 4 + def test_preprocessor_basics(self): + self.run_preprocessing_layer_test( + cls=XLMRobertaPreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[0, 6, 11, 7, 9, 2, 1, 1]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], + }, + [1], # Pass through labels. + [1.0], # Pass through sample_weights. 
+ ), ) def test_errors_for_2d_list_input(self): + preprocessor = XLMRobertaPreprocessor(**self.init_kwargs) ambiguous_input = [["one", "two"], ["three", "four"]] with self.assertRaises(ValueError): - self.preprocessor(ambiguous_input) + preprocessor(ambiguous_input) - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.preprocessor) - new_preprocessor = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_preprocessor.get_config(), - self.preprocessor.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in XLMRobertaPreprocessor.presets: + self.run_preset_test( + cls=XLMRobertaPreprocessor, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_presets.py b/keras_nlp/models/xlm_roberta/xlm_roberta_presets.py index 350c069f1d..477e508906 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_presets.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_presets.py @@ -25,20 +25,7 @@ "path": "xlm_roberta", "model_card": "https://github.com/facebookresearch/fairseq/blob/main/examples/xlmr/README.md", }, - "config": { - "vocabulary_size": 250002, - "num_layers": 12, - "num_heads": 12, - "hidden_dim": 768, - "intermediate_dim": 3072, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/xlm_roberta_base_multi/v1/model.h5", - "weights_hash": "2eb6fcda5a42f0a88056213ba3d93906", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/xlm_roberta_base_multi/v1/vocab.spm", - "spm_proto_hash": "bf25eb5120ad92ef5c7d8596b5dc4046", + "kaggle_handle": "kaggle://keras/xlm_roberta/keras/xlm_roberta_base_multi/2", }, "xlm_roberta_large_multi": { "metadata": { @@ -51,19 +38,6 @@ "path": "xlm_roberta", "model_card": "https://github.com/facebookresearch/fairseq/blob/main/examples/xlmr/README.md", }, - "config": { - "vocabulary_size": 250002, - "num_layers": 24, - "num_heads": 16, - "hidden_dim": 1024, - "intermediate_dim": 4096, - "dropout": 0.1, - "max_sequence_length": 512, - }, - "preprocessor_config": {}, - "weights_url": "https://storage.googleapis.com/keras-nlp/models/xlm_roberta_large_multi/v1/model.h5", - "weights_hash": "276211827174b71751f2ce3a89da503a", - "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/xlm_roberta_large_multi/v1/vocab.spm", - "spm_proto_hash": "bf25eb5120ad92ef5c7d8596b5dc4046", + "kaggle_handle": "kaggle://keras/xlm_roberta/keras/xlm_roberta_large_multi/2", }, } diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_presets_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_presets_test.py deleted file mode 100644 index b74de26b01..0000000000 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_presets_test.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
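The `xlm_roberta_presets.py` change just above follows the same pattern as the Whisper presets earlier in this diff: the inline `config`, `weights_url`/`weights_hash`, and `spm_proto_url` fields collapse into a single `kaggle_handle`, so the architecture config, weights, and vocabulary assets are resolved from the hosted model at load time. A minimal sketch of what this looks like from the user side (the handle is the one in the diff; how it is resolved internally is not shown here and is an assumption):

```python
from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone

# A preset entry is now little more than metadata plus a pointer.
metadata = XLMRobertaBackbone.presets["xlm_roberta_base_multi"]
print(metadata["kaggle_handle"])
# kaggle://keras/xlm_roberta/keras/xlm_roberta_base_multi/2

# from_preset() fetches the config, weights, and assets behind that handle.
backbone = XLMRobertaBackbone.from_preset("xlm_roberta_base_multi")
```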
- -import pytest -from absl.testing import parameterized - -from keras_nlp.backend import ops -from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone -from keras_nlp.models.xlm_roberta.xlm_roberta_classifier import ( - XLMRobertaClassifier, -) -from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( - XLMRobertaPreprocessor, -) -from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( - XLMRobertaTokenizer, -) -from keras_nlp.tests.test_case import TestCase - - -@pytest.mark.large -@pytest.mark.tf_only # TODO: jax OOM. -class XLMRobertaPresetSmokeTest(TestCase): - """ - A smoke test for XLM-RoBERTa presets we run continuously. - - This only tests the smallest weights we have available. Run with: - `pytest keras_nlp/models/xlm_roberta/xlm_roberta_presets_test.py --run_large` - """ - - def test_tokenizer_output(self): - tokenizer = XLMRobertaTokenizer.from_preset( - "xlm_roberta_base_multi", - ) - outputs = tokenizer("The quick brown fox.") - expected_outputs = [581, 63773, 119455, 6, 147797, 5] - self.assertAllEqual(outputs, expected_outputs) - - def test_preprocessor_output(self): - preprocessor = XLMRobertaPreprocessor.from_preset( - "xlm_roberta_base_multi", - sequence_length=4, - ) - outputs = preprocessor("The quick brown fox.")["token_ids"] - expected_outputs = [0, 581, 63773, 2] - self.assertAllEqual(outputs, expected_outputs) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_backbone_output(self, load_weights): - input_data = { - "token_ids": ops.array([[0, 581, 63773, 2]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = XLMRobertaBackbone.from_preset( - "xlm_roberta_base_multi", load_weights=load_weights - ) - outputs = model(input_data) - if load_weights: - outputs = outputs[0, 0, :5] - expected = [0.084763, 0.097018, 0.051329, -0.000805, 0.028415] - self.assertAllClose(outputs, expected, atol=0.01, rtol=0.01) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output(self, load_weights): - input_data = ["The quick brown fox."] - model = XLMRobertaClassifier.from_preset( - "xlm_roberta_base_multi", num_classes=2, load_weights=load_weights - ) - # Never assert output values, as the head weights are random. - model.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_classifier_output_without_preprocessing(self, load_weights): - input_data = { - "token_ids": ops.array([[0, 581, 63773, 2]]), - "padding_mask": ops.array([[1, 1, 1, 1]]), - } - model = XLMRobertaClassifier.from_preset( - "xlm_roberta_base_multi", - num_classes=2, - load_weights=load_weights, - preprocessor=None, - ) - # Never assert output values, as the head weights are random. 
- model.predict(input_data) - - @parameterized.named_parameters( - ("xlm_roberta_tokenizer", XLMRobertaTokenizer), - ("xlm_roberta_preprocessor", XLMRobertaPreprocessor), - ("xlm_roberta", XLMRobertaBackbone), - ("xlm_roberta_classifier", XLMRobertaClassifier), - ) - def test_preset_docstring(self, cls): - """Check we did our docstring formatting correctly.""" - for name in cls.presets: - self.assertRegex(cls.from_preset.__doc__, name) - - @parameterized.named_parameters( - ("xlm_roberta_tokenizer", XLMRobertaTokenizer, {}), - ("xlm_roberta_preprocessor", XLMRobertaPreprocessor, {}), - ("xlm_roberta", XLMRobertaBackbone, {}), - ("xlm_roberta_classifier", XLMRobertaClassifier, {"num_classes": 2}), - ) - def test_unknown_preset_error(self, cls, kwargs): - # Not a preset name - with self.assertRaises(ValueError): - cls.from_preset("xlm_roberta_base_clowntown", **kwargs) - - -@pytest.mark.extra_large -class XLMRobertaPresetFullTest(TestCase): - """ - Test the full enumeration of our preset. - - This tests every XLM-RoBERTa preset and is only run manually. - Run with: - `pytest keras_nlp/models/xlm_roberta/xlm_roberta_presets_test.py --run_extra_large` - """ - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_xlm_roberta(self, load_weights): - for preset in XLMRobertaBackbone.presets: - model = XLMRobertaBackbone.from_preset( - preset, load_weights=load_weights - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), dtype="int64", maxval=model.vocabulary_size - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - model(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_xlm_roberta_classifier(self, load_weights): - for preset in XLMRobertaClassifier.presets: - classifier = XLMRobertaClassifier.from_preset( - preset, - num_classes=4, - load_weights=load_weights, - ) - input_data = ["The quick brown fox."] - classifier.predict(input_data) - - @parameterized.named_parameters( - ("preset_weights", True), ("random_weights", False) - ) - def test_load_xlm_roberta_classifier_without_preprocessing( - self, load_weights - ): - for preset in XLMRobertaClassifier.presets: - classifier = XLMRobertaClassifier.from_preset( - preset, - num_classes=4, - load_weights=load_weights, - preprocessor=None, - ) - input_data = { - "token_ids": ops.random.uniform( - shape=(1, 512), - dtype="int64", - maxval=classifier.backbone.vocabulary_size, - ), - "padding_mask": ops.array([1] * 512, shape=(1, 512)), - } - classifier.predict(input_data) - - def test_load_tokenizers(self): - for preset in XLMRobertaTokenizer.presets: - tokenizer = XLMRobertaTokenizer.from_preset(preset) - tokenizer("The quick brown fox.") - - def test_load_preprocessors(self): - for preset in XLMRobertaPreprocessor.presets: - preprocessor = XLMRobertaPreprocessor.from_preset(preset) - preprocessor("The quick brown fox.") diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py index c0b91abb3c..576f30bca1 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py @@ -88,8 +88,6 @@ def train_sentencepiece(ds, vocab_size): """ def __init__(self, proto, **kwargs): - super().__init__(proto=proto, **kwargs) - # List of special tokens. 
self._vocabulary_prefix = ["<s>", "<pad>", "</s>", "<unk>"] @@ -98,7 +96,15 @@ def __init__(self, proto, **kwargs): self.pad_token_id = 1 # <pad> self.end_token_id = 2 # </s> self.unk_token_id = 3 # <unk> - self.mask_token_id = self.vocabulary_size() - 1 # <mask> + + super().__init__(proto=proto, **kwargs) + + def set_proto(self, proto): + super().set_proto(proto) + if proto is not None: + self.mask_token_id = self.vocabulary_size() - 1 + else: + self.mask_token_id = None def vocabulary_size(self): """Get the size of the tokenizer vocabulary.""" @@ -106,6 +112,7 @@ def vocabulary_size(self): def get_vocabulary(self): """Get the tokenizer vocabulary.""" + self._check_vocabulary() vocabulary = tensor_to_list( self._sentence_piece.id_to_string( tf.range(super().vocabulary_size()) ) @@ -115,6 +122,7 @@ def get_vocabulary(self): def id_to_token(self, id): """Convert an integer id to a string token.""" + self._check_vocabulary() if id == self.mask_token_id: return "<mask>" @@ -132,6 +140,7 @@ def id_to_token(self, id): def token_to_id(self, token): """Convert a string token to an integer id.""" + self._check_vocabulary() if token in self._vocabulary_prefix: return self._vocabulary_prefix.index(token) @@ -146,6 +155,7 @@ def token_to_id(self, token): return int(spm_token_id.numpy()) + 1 def tokenize(self, inputs): + self._check_vocabulary() tokens = super().tokenize(inputs) # Correct `unk_token_id` (0 -> 3). Note that we do not correct @@ -156,9 +166,27 @@ def tokenize(self, inputs): # Shift the tokens IDs right by one. return tf.add(tokens, 1) - def detokenize(self, ids): - ids = tf.ragged.boolean_mask(ids, tf.not_equal(ids, self.mask_token_id)) - return super().detokenize(ids) + def detokenize(self, inputs): + self._check_vocabulary() + tokens = tf.ragged.boolean_mask( + inputs, tf.not_equal(inputs, self.mask_token_id) + ) + + # Shift the tokens IDs left by one. + tokens = tf.subtract(tokens, 1) + + # Correct `unk_token_id`, `end_token_id`, `start_token_id`, respectively. + # Note: The `pad_token_id` is taken as 0 (`unk_token_id`) since the + # proto does not contain `pad_token_id`. This mapping of the pad token + # is done automatically by the above subtraction. + tokens = tf.where(tf.equal(tokens, self.unk_token_id - 1), 0, tokens) + tokens = tf.where(tf.equal(tokens, self.end_token_id - 1), 2, tokens) + tokens = tf.where(tf.equal(tokens, self.start_token_id - 1), 1, tokens) + + # Note: Even though we map `"<s>"` and `"</s>"` to the correct IDs, + # the `detokenize` method will return empty strings for these tokens. + # This is a vagary of the `sentencepiece` library. + return super().detokenize(tokens) @classproperty def presets(cls): diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py index 7800e3913c..2057eff9eb 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py @@ -12,12 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License.
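For readers tracing the `detokenize` rewrite above: the tokenizer keeps the fairseq-style ids (`<s>`=0, `<pad>`=1, `</s>`=2, `<unk>`=3) ahead of the underlying SentencePiece ids, so `tokenize` shifts every SentencePiece id right by one and `detokenize` now undoes that shift (and drops `<mask>` ids) before delegating. Below is a toy illustration of just the shift arithmetic, with the special-id remapping simplified; it is not the class implementation itself:

```python
import tensorflow as tf

# Pretend SentencePiece produced these ids; 0 is its unk id.
spm_ids = tf.constant([[0, 3, 8, 4]])

# tokenize(): shift right by one so ids 0..3 stay free for <s>/<pad>/</s>/<unk>,
# then remap the shifted unk id (1) to the fairseq unk id (3).
shifted = spm_ids + 1
token_ids = tf.where(tf.equal(shifted, 1), 3, shifted)

# detokenize(): undo the shift before handing ids back to SentencePiece.
restored = token_ids - 1
print(token_ids.numpy())  # [[3 4 9 5]]
print(restored.numpy())   # [[2 3 8 4]]
```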
-import io +import os -import sentencepiece -import tensorflow as tf +import pytest -from keras_nlp.backend import keras from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, ) @@ -26,88 +24,36 @@ class XLMRobertaTokenizerTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] + self.init_kwargs = { + # Generated using create_xlm_roberta_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + } + self.input_data = ["the quick brown fox", "the earth is round"] + + def test_tokenizer_basics(self): + self.run_preprocessing_layer_test( + cls=XLMRobertaTokenizer, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=[[6, 11, 7, 9], [6, 8, 10, 12]], ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - ) - self.proto = bytes_io.getvalue() - - self.tokenizer = XLMRobertaTokenizer(proto=self.proto) - - def test_tokenize(self): - input_data = "the quick brown fox" - output = self.tokenizer(input_data) - self.assertAllEqual(output, [4, 9, 5, 7]) - - def test_tokenize_batch(self): - input_data = ["the quick brown fox", "the earth is round"] - output = self.tokenizer(input_data) - self.assertAllEqual(output, [[4, 9, 5, 7], [4, 6, 8, 10]]) - - def test_unk_token(self): - input_data = "the quick brown fox running" - - output = self.tokenizer(input_data) - self.assertAllEqual(output, [4, 9, 5, 7, 3]) - def test_detokenize(self): - input_data = [[4, 9, 5, 7]] - output = self.tokenizer.detokenize(input_data) - self.assertEqual(output, ["brown round earth is"]) - - def test_vocabulary(self): - vocabulary = self.tokenizer.get_vocabulary() - self.assertAllEqual( - vocabulary, - [ - "<s>", - "<pad>", - "</s>", - "<unk>", - "▁the", - "▁brown", - "▁earth", - "▁fox", - "▁is", - "▁quick", - "▁round", - "<mask>", - ], + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=XLMRobertaTokenizer, + preset="xlm_roberta_base_multi", + input_data=["The quick brown fox."], + expected_output=[[581, 63773, 119455, 6, 147797, 5]], ) - self.assertEqual(self.tokenizer.vocabulary_size(), 12) - - def test_id_to_token(self): - print(self.tokenizer.id_to_token(9)) - self.assertEqual(self.tokenizer.id_to_token(9), "▁quick") - self.assertEqual(self.tokenizer.id_to_token(5), "▁brown") - - def test_error_id_out_of_vocabulary(self): - with self.assertRaises(ValueError): - self.tokenizer.id_to_token(self.tokenizer.vocabulary_size()) - with self.assertRaises(ValueError): - self.tokenizer.id_to_token(-1) - def test_token_to_id(self): - self.assertEqual(self.tokenizer.token_to_id("▁the"), 4) - self.assertEqual(self.tokenizer.token_to_id("▁round"), 10) - # Test any random OOV token. - self.assertEqual(self.tokenizer.token_to_id(""), 3) - # Test a special token.
- self.assertEqual(self.tokenizer.token_to_id("<pad>"), 1) - - def test_serialization(self): - config = keras.saving.serialize_keras_object(self.tokenizer) - new_tokenizer = keras.saving.deserialize_keras_object(config) - self.assertEqual( - new_tokenizer.get_config(), - self.tokenizer.get_config(), - ) + @pytest.mark.extra_large + def test_all_presets(self): + for preset in XLMRobertaTokenizer.presets: + self.run_preset_test( + cls=XLMRobertaTokenizer, + preset=preset, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/xlnet/relative_attention.py b/keras_nlp/models/xlnet/relative_attention.py index d11dc4bd5a..a11ae3fd9d 100644 --- a/keras_nlp/models/xlnet/relative_attention.py +++ b/keras_nlp/models/xlnet/relative_attention.py @@ -15,7 +15,6 @@ import math import string -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.backend import ops @@ -76,7 +75,6 @@ def _rel_shift(x, klen=-1): return x -@keras_nlp_export("keras_nlp.layers.TwoStreamRelativeAttention") class TwoStreamRelativeAttention(keras.layers.MultiHeadAttention): """Two-stream relative self-attention for XLNet. diff --git a/keras_nlp/models/xlnet/xlnet_backbone.py b/keras_nlp/models/xlnet/xlnet_backbone.py index 1d1b4d2343..fb196233c9 100644 --- a/keras_nlp/models/xlnet/xlnet_backbone.py +++ b/keras_nlp/models/xlnet/xlnet_backbone.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.models.backbone import Backbone from keras_nlp.models.xlnet.xlnet_content_and_query_embedding import ( XLNetContentAndQueryEmbedding, ) @@ -23,7 +22,7 @@ from keras_nlp.models.xlnet.xlnet_encoder import XLNetSegmentMatrixLayer -@keras_nlp_export("keras_nlp.models.XLNetBackbone") +@keras.saving.register_keras_serializable(package="keras_nlp") class XLNetBackbone(Backbone): """XLNet encoder network. diff --git a/keras_nlp/models/xlnet/xlnet_backbone_test.py b/keras_nlp/models/xlnet/xlnet_backbone_test.py index f8cdc3e7be..f2faf4cdd9 100644 --- a/keras_nlp/models/xlnet/xlnet_backbone_test.py +++ b/keras_nlp/models/xlnet/xlnet_backbone_test.py @@ -12,99 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License.
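A pattern worth calling out in the XLNet changes above (and repeated across this diff): layers lose `@keras_nlp_export`, which removes them from the public `keras_nlp.*` namespace, while classes that must survive saving gain `@keras.saving.register_keras_serializable(package="keras_nlp")`. A minimal sketch of why the registration matters, using a hypothetical layer rather than anything in this change (it assumes the Keras 3-style `keras.saving` API used throughout the diff):

```python
from keras_nlp.backend import keras


@keras.saving.register_keras_serializable(package="keras_nlp")
class ToyLayer(keras.layers.Layer):
    """Serializable via the registry, but not part of the exported API."""

    def call(self, inputs):
        return inputs


# Deserialization finds the class through the registered "keras_nlp>ToyLayer"
# key, with no public import path required.
config = keras.saving.serialize_keras_object(ToyLayer())
restored = keras.saving.deserialize_keras_object(config)
assert isinstance(restored, ToyLayer)
```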
-import os - -import numpy as np import pytest -import tensorflow as tf -from keras_nlp.backend import keras +from keras_nlp.backend import ops from keras_nlp.models.xlnet.xlnet_backbone import XLNetBackbone from keras_nlp.tests.test_case import TestCase class XLNetTest(TestCase): def setUp(self): - self.backbone = XLNetBackbone( - vocabulary_size=10, - num_layers=2, - num_heads=2, - hidden_dim=2, - intermediate_dim=4, - name="xlnet_backbone", - ) - - self.input_batch = { - "token_ids": np.ones((2, 7), dtype=np.int32), - "segment_ids": np.ones((2, 7), dtype=np.int32), - "padding_mask": np.ones((2, 7), dtype=np.int32), + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "num_heads": 2, + "hidden_dim": 2, + "intermediate_dim": 4, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "segment_ids": ops.zeros((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), } - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_call(self): - self.backbone(self.input_batch) - - def test_token_embedding(self): - output = self.backbone.token_embedding(self.input_batch["token_ids"]) - self.assertEqual(output.shape, (2, 7, 2)) - - def test_variable_sequence_length(self): - for seq_length in (20, 30, 40): - input_data = { - "token_ids": np.ones((2, seq_length), dtype=np.int32), - "padding_mask": np.ones((2, seq_length), dtype=np.int32), - "segment_ids": np.ones((2, seq_length), dtype=np.int32), - } - self.backbone(input_data) - - def test_predict(self): - self.backbone.predict(self.input_batch) - self.backbone.predict(self.input_dataset) - def test_serialization(self): - new_backbone = keras.saving.deserialize_keras_object( - keras.saving.serialize_keras_object(self.backbone) + def test_backbone_basics(self): + self.run_backbone_test( + cls=XLNetBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 2), ) - self.assertEqual(new_backbone.get_config(), self.backbone.get_config()) + @pytest.mark.large def test_saved_model(self): - model_output = self.backbone(self.input_batch) - path = os.path.join(self.get_temp_dir(), "model.keras") - self.backbone.save(path, save_format="keras_v3") - restored_model = keras.models.load_model(path) - - # Check we got the real object back. - self.assertIsInstance(restored_model, XLNetBackbone) - - # Check that output matches. 
- restored_output = restored_model(self.input_batch) - self.assertAllClose(model_output, restored_output) - - -@pytest.mark.tpu -@pytest.mark.usefixtures("tpu_test_class") -class XLNetTPUTest(TestCase): - def setUp(self): - with self.tpu_strategy.scope(): - self.backbone = XLNetBackbone( - vocabulary_size=1000, - num_layers=2, - num_heads=2, - hidden_dim=64, - intermediate_dim=128, - ) - self.input_batch = { - "token_ids": np.ones((2, 7), dtype=np.int32), - "padding_mask": np.ones((2, 7), dtype=np.int32), - "segment_ids": np.ones((2, 7), dtype=np.int32), - } - - self.input_dataset = tf.data.Dataset.from_tensor_slices( - self.input_batch - ).batch(2) - - def test_predict(self): - self.backbone.compile() - self.backbone.predict(self.input_dataset) + self.run_model_saving_test( + cls=XLNetBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) diff --git a/keras_nlp/models/xlnet/xlnet_encoder.py b/keras_nlp/models/xlnet/xlnet_encoder.py index 13f5a953ee..bb8e56e4cc 100644 --- a/keras_nlp/models/xlnet/xlnet_encoder.py +++ b/keras_nlp/models/xlnet/xlnet_encoder.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import keras from keras_nlp.backend import ops from keras_nlp.models.xlnet.relative_attention import TwoStreamRelativeAttention @@ -22,7 +21,6 @@ def xlnet_kernel_initializer(stddev=0.02): return keras.initializers.TruncatedNormal(stddev=stddev) -@keras_nlp_export("keras_nlp.layers.XLNetEncoder") class XLNetEncoder(keras.layers.Layer): """ XLNet Encoder. diff --git a/keras_nlp/tests/test_case.py b/keras_nlp/tests/test_case.py index d0af5a47a4..455a8569b7 100644 --- a/keras_nlp/tests/test_case.py +++ b/keras_nlp/tests/test_case.py @@ -13,16 +13,20 @@ # limitations under the License. import json +import os +import pathlib +import re import tensorflow as tf import tree from absl.testing import parameterized -from keras_core.backend import is_float_dtype -from keras_core.backend import standardize_dtype from keras_nlp.backend import config from keras_nlp.backend import keras from keras_nlp.backend import ops +from keras_nlp.tokenizers.tokenizer import Tokenizer +from keras_nlp.utils.tensor_utils import is_float_dtype +from keras_nlp.utils.tensor_utils import standardize_dtype def convert_to_comparible_type(x): @@ -39,7 +43,7 @@ def convert_to_comparible_type(x): return tree.map_structure(lambda x: x.decode("utf-8"), x) if isinstance(x, (tf.Tensor, tf.RaggedTensor)): return x - if ops.is_tensor(x): + if hasattr(x, "__array__"): return ops.convert_to_numpy(x) return x @@ -74,7 +78,7 @@ def assertDTypeEqual(self, x, expected_dtype, msg=None): def run_layer_test( self, - layer_cls, + cls, init_kwargs, input_data, expected_output_shape, @@ -85,9 +89,10 @@ def run_layer_test( run_training_check=True, run_mixed_precision_check=True, ): + """Run basic tests for a modeling layer.""" # Serialization test. - layer = layer_cls(**init_kwargs) - self.run_class_serialization_test(layer) + layer = cls(**init_kwargs) + self.run_serialization_test(layer) def run_build_asserts(layer): self.assertTrue(layer.built) @@ -138,12 +143,14 @@ def call(self, x): return self.layer(x) model = TestModel(layer) - model.compile(optimizer="sgd", loss="mse", jit_compile=True) + # Temporarily disable jit compilation on torch backend. 
+ jit_compile = config.backend() != "torch"
+ model.compile(optimizer="sgd", loss="mse", jit_compile=jit_compile)
model.fit(input_data, output_data, verbose=0)
- if config.multi_backend():
+ if config.keras_3():
# Build test.
- layer = layer_cls(**init_kwargs)
+ layer = cls(**init_kwargs)
if isinstance(input_data, dict):
shapes = {k + "_shape": v.shape for k, v in input_data.items()}
layer.build(**shapes)
@@ -155,7 +162,7 @@ def call(self, x):
keras_tensor_inputs = tree.map_structure(
lambda x: keras.KerasTensor(x.shape, x.dtype), input_data
)
- layer = layer_cls(**init_kwargs)
+ layer = cls(**init_kwargs)
if isinstance(keras_tensor_inputs, dict):
keras_tensor_outputs = layer(**keras_tensor_inputs)
else:
@@ -164,7 +171,7 @@ def call(self, x):
run_output_asserts(layer, keras_tensor_outputs)
# Eager call test and compiled training test.
- layer = layer_cls(**init_kwargs)
+ layer = cls(**init_kwargs)
if isinstance(input_data, dict):
output_data = layer(**input_data)
else:
@@ -181,7 +188,7 @@ def call(self, x):
run_mixed_precision_check = torch.cuda.is_available()
if run_mixed_precision_check:
- layer = layer_cls(**{**init_kwargs, "dtype": "mixed_float16"})
+ layer = cls(**{**init_kwargs, "dtype": "mixed_float16"})
if isinstance(input_data, dict):
output_data = layer(**input_data)
else:
@@ -193,7 +200,55 @@ def call(self, x):
if is_float_dtype(weight.dtype):
self.assertDTypeEqual(weight, "float32")
- def run_class_serialization_test(self, instance):
+ def run_preprocessing_layer_test(
+ self,
+ cls,
+ init_kwargs,
+ input_data,
+ expected_output=None,
+ expected_detokenize_output=None,
+ ):
+ """Run basic tests for a preprocessing layer."""
+ layer = cls(**init_kwargs)
+ # Check serialization (without a full save).
+ self.run_serialization_test(layer)
+
+ ds = tf.data.Dataset.from_tensor_slices(input_data)
+
+ # Run with direct call.
+ if isinstance(input_data, tuple):
+ # Mimic tf.data unpacking behavior for preprocessing layers.
+ output = layer(*input_data)
+ else:
+ output = layer(input_data)
+
+ # For tokenizers only, also check detokenize.
+ if isinstance(layer, Tokenizer):
+ if not expected_detokenize_output:
+ expected_detokenize_output = input_data
+ detokenize_output = layer.detokenize(output)
+ self.assertAllEqual(detokenize_output, expected_detokenize_output)
+
+ # Run with an unbatched dataset.
+ output_ds = ds.map(layer).ragged_batch(1_000)
+ self.assertAllClose(output, output_ds.get_single_element())
+
+ # Run with a batched dataset.
+ output_ds = ds.batch(1_000).map(layer)
+ self.assertAllClose(output, output_ds.get_single_element())
+
+ if expected_output:
+ self.assertAllClose(output, expected_output)
+
+ def run_serialization_test(self, instance):
+ """Check idempotency of serialize/deserialize.
+
+ Note this is a much faster test than saving."""
+ run_dir_test = True
+ # Tokenizers will not initialize the tensorflow trackable system after
+ # clone, leading to some weird errors here.
+ if config.backend() == "tensorflow" and isinstance(instance, Tokenizer):
+ run_dir_test = False
# get_config roundtrip
cls = instance.__class__
cfg = instance.get_config()
@@ -203,9 +258,8 @@ def run_class_serialization_test(self, instance):
revived_cfg = revived_instance.get_config()
revived_cfg_json = json.dumps(revived_cfg, sort_keys=True, indent=4)
self.assertEqual(cfg_json, revived_cfg_json)
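A tokenizer test can then be written against the new `run_preprocessing_layer_test` helper; the class choice, vocabulary, and expected ids below are illustrative values, not taken from this diff:

```python
# Hypothetical tokenizer test using the new helper.
from keras_nlp.tests.test_case import TestCase
from keras_nlp.tokenizers import WordPieceTokenizer


class ToyTokenizerTest(TestCase):
    def test_tokenizer_basics(self):
        self.run_preprocessing_layer_test(
            cls=WordPieceTokenizer,
            init_kwargs={"vocabulary": ["[UNK]", "the", "fox"]},
            input_data=["the fox"],
            expected_output=[[1, 2]],  # invented ids for this toy vocab
        )
```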
- # Dir tests only work on keras-core.
- if config.multi_backend():
- self.assertEqual(ref_dir, dir(revived_instance))
+ if run_dir_test:
+ self.assertEqual(set(ref_dir), set(dir(revived_instance)))
# serialization roundtrip
serialized = keras.saving.serialize_keras_object(instance)
@@ -216,10 +270,167 @@
revived_cfg = revived_instance.get_config()
revived_cfg_json = json.dumps(revived_cfg, sort_keys=True, indent=4)
self.assertEqual(cfg_json, revived_cfg_json)
- # Dir tests only work on keras-core.
- if config.multi_backend():
+ if run_dir_test:
new_dir = dir(revived_instance)[:]
for lst in [ref_dir, new_dir]:
if "__annotations__" in lst:
lst.remove("__annotations__")
- self.assertEqual(ref_dir, new_dir)
+ self.assertEqual(set(ref_dir), set(new_dir))
+
+ def run_model_saving_test(
+ self,
+ cls,
+ init_kwargs,
+ input_data,
+ ):
+ """Save and load a model from disk and assert output is unchanged."""
+ model = cls(**init_kwargs)
+ model_output = model(input_data)
+ path = os.path.join(self.get_temp_dir(), "model.keras")
+ model.save(path, save_format="keras_v3")
+ restored_model = keras.models.load_model(path)
+
+ # Check we got the real object back.
+ self.assertIsInstance(restored_model, cls)
+
+ # Check that output matches.
+ restored_output = restored_model(input_data)
+ self.assertAllClose(model_output, restored_output)
+
+ def run_backbone_test(
+ self,
+ cls,
+ init_kwargs,
+ input_data,
+ expected_output_shape,
+ variable_length_data=None,
+ ):
+ """Run basic tests for a backbone, including compilation."""
+ backbone = cls(**init_kwargs)
+ # Check serialization (without a full save).
+ self.run_serialization_test(backbone)
+
+ # Call model eagerly.
+ output = backbone(input_data)
+ if isinstance(expected_output_shape, dict):
+ for key in expected_output_shape:
+ self.assertEqual(output[key].shape, expected_output_shape[key])
+ else:
+ self.assertEqual(output.shape, expected_output_shape)
+
+ # Check we can embed tokens eagerly.
+ output = backbone.token_embedding(ops.zeros((2, 3), dtype="int32"))
+
+ # Check variable length sequences.
+ if variable_length_data is None:
+ # If no variable length data passed, assume the second axis of all
+ # inputs is our sequence axis and create it ourselves.
+ variable_length_data = [
+ tree.map_structure(lambda x: x[:, :seq_length, ...], input_data)
+ for seq_length in (2, 3, 4)
+ ]
+ for batch in variable_length_data:
+ backbone(batch)
+
+ # Check compiled predict function.
+ backbone.predict(input_data)
+ # Convert to numpy first, torch GPU tensor -> tf.data will error.
+ numpy_data = tree.map_structure(ops.convert_to_numpy, input_data)
+ # Create a dataset.
+ input_dataset = tf.data.Dataset.from_tensor_slices(numpy_data).batch(2)
+ backbone.predict(input_dataset)
+
+ # Check name maps to classname.
+ name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", cls.__name__)
+ name = re.sub("([a-z])([A-Z])", r"\1_\2", name).lower()
+ self.assertRegex(backbone.name, name)
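The task-level counterpart, `run_task_test`, is defined just below; a classifier test might drive it roughly like this. The `backbone` and `preprocessor` objects are assumed to be built in `setUp`, and the labels and expected shape are invented:

```python
# Hypothetical task test sketch; only the helper call pattern is from this diff.
from keras_nlp.models.bert.bert_classifier import BertClassifier
from keras_nlp.tests.test_case import TestCase


class BertClassifierTaskTest(TestCase):
    def test_classifier_basics(self):
        self.run_task_test(
            cls=BertClassifier,
            init_kwargs={
                "backbone": self.backbone,          # assumption: tiny BertBackbone from setUp
                "preprocessor": self.preprocessor,  # assumption: matching preprocessor
                "num_classes": 2,
            },
            train_data=(
                ["the quick brown fox.", "the slow brown fox."],  # x
                [0, 1],  # y
            ),
            expected_output_shape=(2, 2),
        )
```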
+
+ def run_task_test(
+ self,
+ cls,
+ init_kwargs,
+ train_data,
+ expected_output_shape=None,
+ batch_size=2,
+ ):
+ """Run basic tests for a task, including compilation."""
+ task = cls(**init_kwargs)
+ # Check serialization (without a full save).
+ self.run_serialization_test(task)
+ preprocessor = task.preprocessor
+ ds = tf.data.Dataset.from_tensor_slices(train_data).batch(batch_size)
+ x, y, sw = keras.utils.unpack_x_y_sample_weight(train_data)
+
+ # Test predict.
+ output = task.predict(x)
+ if expected_output_shape is not None:
+ output_shape = tree.map_structure(lambda x: x.shape, output)
+ self.assertAllClose(output_shape, expected_output_shape)
+ # With a dataset.
+ output_ds = task.predict(ds)
+ self.assertAllClose(output, output_ds)
+ # With split preprocessing.
+ task.preprocessor = None
+ output_split = task.predict(ds.map(preprocessor))
+ task.preprocessor = preprocessor
+ self.assertAllClose(output, output_split)
+
+ # Test fit.
+ task.fit(x, y, sample_weight=sw)
+ # With a dataset.
+ task.fit(ds)
+ # With split preprocessing.
+ task.preprocessor = None
+ task.fit(ds.map(preprocessor))
+ task.preprocessor = preprocessor
+
+ def run_preset_test(
+ self,
+ cls,
+ preset,
+ input_data,
+ init_kwargs={},
+ expected_output=None,
+ expected_output_shape=None,
+ expected_partial_output=None,
+ ):
+ """Run instantiation and a forward pass for a preset."""
+ self.assertRegex(cls.from_preset.__doc__, preset)
+
+ with self.assertRaises(Exception):
+ cls.from_preset("clowntown", **init_kwargs)
+
+ instance = cls.from_preset(preset, **init_kwargs)
+
+ if isinstance(input_data, tuple):
+ # Mimic tf.data unpacking behavior for preprocessing layers.
+ output = instance(*input_data)
+ else:
+ output = instance(input_data)
+
+ if isinstance(instance, keras.Model):
+ instance = cls.from_preset(
+ preset, load_weights=False, **init_kwargs
+ )
+ instance(input_data)
+
+ if expected_output is not None:
+ self.assertAllClose(output, expected_output)
+
+ if expected_output_shape is not None:
+ output_shape = tree.map_structure(lambda x: x.shape, output)
+ self.assertAllClose(output_shape, expected_output_shape)
+
+ if expected_partial_output is not None:
+ # Allow passing a partial output snippet of the last dimension.
+ # We want to check stability, but the full output would be too long.
+ def compare(actual, expected): + expected = ops.convert_to_numpy(expected) + self.assertEqual(len(expected.shape), 1) + actual = ops.reshape(actual, (-1,))[: expected.shape[0]] + self.assertAllClose(actual, expected, atol=0.01, rtol=0.01) + + tree.map_structure(compare, output, expected_partial_output) + + def get_test_data_dir(self): + return str(pathlib.Path(__file__).parent / "test_data") diff --git a/keras_nlp/tests/test_data/albert_test_vocab.spm b/keras_nlp/tests/test_data/albert_test_vocab.spm new file mode 100644 index 0000000000..8520ca4919 Binary files /dev/null and b/keras_nlp/tests/test_data/albert_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/deberta_v3_test_vocab.spm b/keras_nlp/tests/test_data/deberta_v3_test_vocab.spm new file mode 100644 index 0000000000..1c4aa4bbb8 Binary files /dev/null and b/keras_nlp/tests/test_data/deberta_v3_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/f_net_test_vocab.spm b/keras_nlp/tests/test_data/f_net_test_vocab.spm new file mode 100644 index 0000000000..8520ca4919 Binary files /dev/null and b/keras_nlp/tests/test_data/f_net_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/mistral_test_vocab.spm b/keras_nlp/tests/test_data/mistral_test_vocab.spm new file mode 100644 index 0000000000..d753476f53 Binary files /dev/null and b/keras_nlp/tests/test_data/mistral_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/no_special_token_vocab.spm b/keras_nlp/tests/test_data/no_special_token_vocab.spm new file mode 100644 index 0000000000..582613aba7 Binary files /dev/null and b/keras_nlp/tests/test_data/no_special_token_vocab.spm differ diff --git a/keras_nlp/tests/test_data/t5_test_vocab.spm b/keras_nlp/tests/test_data/t5_test_vocab.spm new file mode 100644 index 0000000000..ce7ecf0c49 Binary files /dev/null and b/keras_nlp/tests/test_data/t5_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/tokenizer_test_vocab.spm b/keras_nlp/tests/test_data/tokenizer_test_vocab.spm new file mode 100644 index 0000000000..ec895a9d67 Binary files /dev/null and b/keras_nlp/tests/test_data/tokenizer_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm b/keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm new file mode 100644 index 0000000000..8520ca4919 Binary files /dev/null and b/keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm differ diff --git a/keras_nlp/tokenizers/byte_pair_tokenizer.py b/keras_nlp/tokenizers/byte_pair_tokenizer.py index 6ec140a113..55992a16d7 100644 --- a/keras_nlp/tokenizers/byte_pair_tokenizer.py +++ b/keras_nlp/tokenizers/byte_pair_tokenizer.py @@ -28,13 +28,14 @@ import tensorflow as tf from keras_nlp.api_export import keras_nlp_export -from keras_nlp.backend import keras from keras_nlp.tokenizers import tokenizer +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring from keras_nlp.utils.tensor_utils import assert_tf_text_installed from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype from keras_nlp.utils.tensor_utils import is_string_dtype try: @@ -42,6 +43,10 @@ except ImportError: tf_text = None +VOCAB_FILENAME = "vocabulary.json" +MERGES_FILENAME = "merges.txt" + + # As python and TF handles special spaces differently, we need to # manually 
handle special spaces during string split. SPECIAL_WHITESPACES = r"\x{a0}\x{2009}\x{202f}\x{3000}" @@ -273,8 +278,8 @@ class BytePairTokenizer(tokenizer.Tokenizer): def __init__( self, - vocabulary, - merges, + vocabulary=None, + merges=None, sequence_length=None, add_prefix_space=False, unsplittable_tokens=None, @@ -283,16 +288,58 @@ def __init__( ) -> None: assert_tf_text_installed(self.__class__.__name__) - if not is_integer_dtype(dtype) and not is_string_dtype(dtype): + if not is_int_dtype(dtype) and not is_string_dtype(dtype): raise ValueError( "Output dtype must be an integer type or a string. " f"Received: dtype={dtype}" ) super().__init__(dtype=dtype, **kwargs) + self.sequence_length = sequence_length + self.add_prefix_space = add_prefix_space + self.unsplittable_tokens = unsplittable_tokens + + # Create byte <=> unicode mapping. This is useful for handling + # whitespace tokens. + byte_list, unicode_list = bytes_to_unicode() + self.byte2unicode = create_static_hashtable( + byte_list, unicode_list, default="" + ) + self.unicode2byte = create_static_hashtable( + unicode_list, byte_list, default="" + ) + + self.set_vocabulary_and_merges(vocabulary, merges) + + def save_assets(self, dir_path): + vocab_path = os.path.join(dir_path, VOCAB_FILENAME) + merges_path = os.path.join(dir_path, MERGES_FILENAME) + with open(vocab_path, "w", encoding="utf-8") as file: + file.write(json.dumps(dict(self.vocabulary))) + with open(merges_path, "w", encoding="utf-8") as file: + for merge in self.merges: + file.write(f"{merge}\n") + + def load_assets(self, dir_path): + vocab_path = os.path.join(dir_path, VOCAB_FILENAME) + merges_path = os.path.join(dir_path, MERGES_FILENAME) + self.set_vocabulary_and_merges(vocab_path, merges_path) + + def set_vocabulary_and_merges(self, vocabulary, merges): + """Set the vocabulary and merge rules from data or files.""" + if vocabulary is None or merges is None: + # Clear vocab related state. + self.vocabulary = None + self.merges = None + self.cache = None + self.id_to_token_map = None + self.token_to_id_map = None + self.merge_ranks_lookup_default = None + self.merge_ranks = None + return if isinstance(vocabulary, str): - with open(vocabulary, "r") as f: + with open(vocabulary, "r", encoding="utf-8") as f: self.vocabulary = json.load(f) elif isinstance(vocabulary, dict): self.vocabulary = vocabulary.copy() @@ -303,7 +350,7 @@ def __init__( f"`type(vocabulary)={type(vocabulary)}`." ) if isinstance(merges, str): - self.merges = [bp.rstrip() for bp in tf.io.gfile.GFile(merges)] + self.merges = [bp.rstrip() for bp in open(merges, encoding="utf-8")] elif isinstance(merges, Iterable): self.merges = list(merges) else: @@ -311,25 +358,14 @@ def __init__( "Merges must be a file path or a list of merge rules. " f"Received: `type(merges)={type(merges)}`" ) - self.sequence_length = sequence_length - self.add_prefix_space = add_prefix_space - self.unsplittable_tokens = unsplittable_tokens - - # Create byte <=> unicode mapping. This is useful for handling - # whitespace tokens. - byte_list, unicode_list = bytes_to_unicode() - self.byte2unicode = create_static_hashtable( - byte_list, unicode_list, default="" - ) - self.unicode2byte = create_static_hashtable( - unicode_list, byte_list, default="" - ) self.cache = BytePairTokenizerCache() - if unsplittable_tokens: + if self.unsplittable_tokens: # Put special tokens into cache, so it won't be further split and # merged. 
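These hunks make the BPE vocabulary attachable after construction rather than required at `__init__`. A rough sketch of the intended round trip, assuming local `vocab.json` and `merges.txt` files and an existing `assets/tokenizer` directory:

```python
# Sketch of the deferred-vocabulary workflow introduced here.
from keras_nlp.tokenizers import BytePairTokenizer

tokenizer = BytePairTokenizer()  # vocabulary can now be attached later
tokenizer.set_vocabulary_and_merges("vocab.json", "merges.txt")
tokenizer.save_assets("assets/tokenizer")  # writes vocabulary.json + merges.txt

restored = BytePairTokenizer()
restored.load_assets("assets/tokenizer")  # re-attaches the same vocabulary
```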
- self.cache.insert(unsplittable_tokens, unsplittable_tokens) + self.cache.insert( + self.unsplittable_tokens, self.unsplittable_tokens + ) # Create mapping between string tokens to int ids, and vice versa. byte_pairs = [x[0] for x in self.vocabulary.items()] @@ -356,10 +392,12 @@ def __init__( def get_vocabulary(self) -> List[str]: """Get the tokenizer vocabulary as a list of strings tokens.""" + self._check_vocabulary() return self.vocabulary.keys() def vocabulary_size(self) -> int: """Get the size of the tokenizer vocabulary.""" + self._check_vocabulary() return len(self.vocabulary) def id_to_token(self, id: int) -> str: @@ -367,6 +405,7 @@ def id_to_token(self, id: int) -> str: # This will be slow, but keep memory usage down compared to building a # dict. Assuming the main use case is looking up a few special tokens # early in the vocab, this should be fine. + self._check_vocabulary() keys = self.get_vocabulary() for token in keys: @@ -376,24 +415,9 @@ def id_to_token(self, id: int) -> str: def token_to_id(self, token: str) -> int: """Convert a string token to an integer id.""" + self._check_vocabulary() return self.vocabulary[token] - def get_config(self): - config = super().get_config() - config.update( - { - # Ideally vocabulary and merge list would be saved as plain text - # assets in the saved model. We have no good way to support - # this currently, so we save the vocabulary in the config. - "vocabulary": self.vocabulary, - "merges": self.merges, - "sequence_length": self.sequence_length, - "add_prefix_space": self.add_prefix_space, - "unsplittable_tokens": self.unsplittable_tokens, - } - ) - return config - @tf.function def _bpe_merge_one_step(self, words, mask): """Perform one step of byte-pair merge.""" @@ -499,7 +523,16 @@ def loop_condition(_, mask): ) return merged_words + def _check_vocabulary(self): + if self.vocabulary is None: + raise ValueError( + "No vocabulary has been set for BytePairTokenizer. Make sure " + "to pass `vocabulary` and `merges` arguments when creating the " + "layer." + ) + def tokenize(self, inputs): + self._check_vocabulary() if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)): inputs = tf.convert_to_tensor(inputs) @@ -560,8 +593,9 @@ def process_unseen_tokens(): return tokens def detokenize(self, inputs): + self._check_vocabulary() inputs, unbatched, _ = convert_to_ragged_batch(inputs) - + inputs = tf.cast(inputs, self.dtype) unicode_text = tf.strings.reduce_join( self.id_to_token_map.lookup(inputs), axis=-1 ) @@ -592,6 +626,17 @@ def _bpe_merge_and_update_cache(self, tokens): ) self.cache.insert(tokens, tokenized_words) + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "add_prefix_space": self.add_prefix_space, + "unsplittable_tokens": self.unsplittable_tokens, + } + ) + return config + @classproperty def presets(cls): return {} @@ -619,42 +664,19 @@ def from_preset( tokenizer.detokenize([5, 6, 7, 8, 9]) ``` """ - - if not cls.presets: - raise NotImplementedError( - "No presets have been created for this class" - ) - - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. 
Received: {preset}.""" - ) - metadata = cls.presets[preset] - - vocabulary = keras.utils.get_file( - "vocab.json", - metadata["vocabulary_url"], - cache_subdir=os.path.join("models", preset), - file_hash=metadata["vocabulary_hash"], - ) - merges = keras.utils.get_file( - "merges.txt", - metadata["merges_url"], - cache_subdir=os.path.join("models", preset), - file_hash=metadata["merges_hash"], - ) - - config = metadata["preprocessor_config"] - config.update( - { - "vocabulary": vocabulary, - "merges": merges, - }, + # We support short IDs for official presets, e.g. `"bert_base_en"`. + # Map these to a Kaggle Models handle. + if preset in cls.presets: + preset = cls.presets[preset]["kaggle_handle"] + + config_file = "tokenizer.json" + check_preset_class(preset, cls, config_file=config_file) + return load_from_preset( + preset, + config_file=config_file, + config_overrides=kwargs, ) - return cls.from_config({**config, **kwargs}) - def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. super().__init_subclass__(**kwargs) diff --git a/keras_nlp/tokenizers/byte_pair_tokenizer_test.py b/keras_nlp/tokenizers/byte_pair_tokenizer_test.py index d5f7b3762a..00f8f9b87f 100644 --- a/keras_nlp/tokenizers/byte_pair_tokenizer_test.py +++ b/keras_nlp/tokenizers/byte_pair_tokenizer_test.py @@ -164,6 +164,9 @@ def test_config(self): cloned_tokenizer = BytePairTokenizer.from_config( self.tokenizer.get_config() ) + cloned_tokenizer.set_vocabulary_and_merges( + self.tokenizer.vocabulary, self.tokenizer.merges + ) self.assertAllEqual( self.tokenizer(input_data), cloned_tokenizer(input_data), diff --git a/keras_nlp/tokenizers/byte_tokenizer.py b/keras_nlp/tokenizers/byte_tokenizer.py index f6b60bfeaf..3aefc4a01d 100644 --- a/keras_nlp/tokenizers/byte_tokenizer.py +++ b/keras_nlp/tokenizers/byte_tokenizer.py @@ -19,7 +19,7 @@ from keras_nlp.tokenizers import tokenizer from keras_nlp.utils.tensor_utils import assert_tf_text_installed from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype try: import tensorflow_text as tf_text @@ -165,7 +165,7 @@ def __init__( ): assert_tf_text_installed(self.__class__.__name__) - if not is_integer_dtype(dtype): + if not is_int_dtype(dtype): raise ValueError( "Output dtype must be an integer type. 
" f"Received: dtype={dtype}" diff --git a/keras_nlp/tokenizers/sentence_piece_tokenizer.py b/keras_nlp/tokenizers/sentence_piece_tokenizer.py index 2308c33506..ae655aceb6 100644 --- a/keras_nlp/tokenizers/sentence_piece_tokenizer.py +++ b/keras_nlp/tokenizers/sentence_piece_tokenizer.py @@ -20,13 +20,14 @@ import tensorflow as tf from keras_nlp.api_export import keras_nlp_export -from keras_nlp.backend import keras from keras_nlp.tokenizers import tokenizer +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring from keras_nlp.utils.tensor_utils import assert_tf_text_installed from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype from keras_nlp.utils.tensor_utils import is_string_dtype from keras_nlp.utils.tensor_utils import tensor_to_list @@ -36,6 +37,9 @@ tf_text = None +VOCAB_FILENAME = "vocabulary.spm" + + @keras_nlp_export("keras_nlp.tokenizers.SentencePieceTokenizer") class SentencePieceTokenizer(tokenizer.Tokenizer): """A SentencePiece tokenizer layer. @@ -106,14 +110,14 @@ def train_sentence_piece_file(ds, path, size): def __init__( self, - proto, + proto=None, sequence_length: int = None, dtype="int32", **kwargs, ) -> None: assert_tf_text_installed(self.__class__.__name__) - if not is_integer_dtype(dtype) and not is_string_dtype(dtype): + if not is_int_dtype(dtype) and not is_string_dtype(dtype): raise ValueError( "Output dtype must be an integer type or a string. " f"Received: dtype={dtype}" @@ -121,6 +125,25 @@ def __init__( super().__init__(dtype=dtype, **kwargs) + self.proto = None + self.sequence_length = sequence_length + self.set_proto(proto) + + def save_assets(self, dir_path): + path = os.path.join(dir_path, VOCAB_FILENAME) + with open(path, "wb") as file: + file.write(self.proto) + + def load_assets(self, dir_path): + path = os.path.join(dir_path, VOCAB_FILENAME) + self.set_proto(path) + + def set_proto(self, proto): + if proto is None: + self.proto = None + self._sentence_piece = None + return + if isinstance(proto, str): # A string could be either a filepath, or a base64 encoded byte # array (which we need for serialization). We will heuristically @@ -134,7 +157,7 @@ def __init__( except binascii.Error: pass if not is_base64: - proto_bytes = tf.io.gfile.GFile(proto, "rb").read() + proto_bytes = open(proto, "rb").read() elif isinstance(proto, bytes): proto_bytes = proto else: @@ -148,18 +171,18 @@ def __init__( model=proto_bytes, out_type=self.compute_dtype, ) - # Keras cannot serialize a bytestring, so we base64 encode the model # byte array as a string for saving. 
- self.proto = base64.b64encode(proto_bytes).decode("ascii")
- self.sequence_length = sequence_length
+ self.proto = proto_bytes

def vocabulary_size(self) -> int:
"""Get the size of the tokenizer vocabulary."""
+ self._check_vocabulary()
return int(self._sentence_piece.vocab_size().numpy())

def get_vocabulary(self) -> List[str]:
"""Get the tokenizer vocabulary."""
+ self._check_vocabulary()
return tensor_to_list(
self._sentence_piece.id_to_string(
tf.range(int(self._sentence_piece.vocab_size().numpy()))
@@ -168,6 +191,7 @@ def id_to_token(self, id: int) -> str:
"""Convert an integer id to a string token."""
+ self._check_vocabulary()
if id >= self.vocabulary_size() or id < 0:
raise ValueError(
f"`id` must be in range [0, {self.vocabulary_size() - 1}]. "
@@ -177,28 +201,40 @@ def token_to_id(self, token: str) -> int:
"""Convert a string token to an integer id."""
+ self._check_vocabulary()
return int(self._sentence_piece.string_to_id(token).numpy())

def get_config(self):
config = super().get_config()
config.update(
{
- # Ideally the model would be saved as a file asset in
- # the saved model. We have no good way to support this
- # currently, so we save the model string in the config.
- "proto": self.proto,
+ "proto": None, # Save vocabulary via an asset!
"sequence_length": self.sequence_length,
}
)
return config

+ def _check_vocabulary(self):
+ if self.proto is None:
+ raise ValueError(
+ "No vocabulary has been set for SentencePieceTokenizer. Make "
+ "sure to pass a `proto` argument when creating the layer."
+ )
+
def tokenize(self, inputs):
+ self._check_vocabulary()
if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
inputs = tf.convert_to_tensor(inputs)
scalar_input = inputs.shape.rank == 0
if scalar_input:
inputs = tf.expand_dims(inputs, 0)
+ if self._sentence_piece is None:
+ raise ValueError(
+ "No vocabulary has been set for SentencePieceTokenizer. Make "
+ "sure to pass a `proto` argument when creating the layer."
+ )
+
tokens = self._sentence_piece.tokenize(inputs)

# Convert to a dense output if `sequence_length` is set.
@@ -215,6 +251,7 @@ def tokenize(self, inputs):
return tokens

def detokenize(self, inputs):
+ self._check_vocabulary()
inputs, unbatched, _ = convert_to_ragged_batch(inputs)
outputs = self._sentence_piece.detokenize(inputs)
if unbatched:
@@ -248,35 +285,19 @@ def from_preset(
tokenizer.detokenize([5, 6, 7, 8, 9])
```
"""
-
- if not cls.presets:
- raise NotImplementedError(
- "No presets have been created for this class"
- )
-
- if preset not in cls.presets:
- raise ValueError(
- "`preset` must be one of "
- f"""{", ".join(cls.presets)}. Received: {preset}."""
- )
- metadata = cls.presets[preset]
-
- spm_proto = keras.utils.get_file(
- "vocab.spm",
- metadata["spm_proto_url"],
- cache_subdir=os.path.join("models", preset),
- file_hash=metadata["spm_proto_hash"],
- )
-
- config = metadata["preprocessor_config"]
- config.update(
- {
- "proto": spm_proto,
- },
+ # We support short IDs for official presets, e.g. `"bert_base_en"`.
+ # Map these to a Kaggle Models handle.
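Because `get_config()` now stores `proto` as `None`, a config round trip must re-attach the proto explicitly, exactly as the updated `sentence_piece_tokenizer_test.py` below does. A minimal sketch, assuming a local `vocab.spm` file:

```python
# The proto is held as raw bytes and re-attached after from_config.
from keras_nlp.tokenizers import SentencePieceTokenizer

tokenizer = SentencePieceTokenizer(proto="vocab.spm")
clone = SentencePieceTokenizer.from_config(tokenizer.get_config())
clone.set_proto(tokenizer.proto)  # the config no longer carries the proto
```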
+ if preset in cls.presets: + preset = cls.presets[preset]["kaggle_handle"] + + config_file = "tokenizer.json" + check_preset_class(preset, cls, config_file=config_file) + return load_from_preset( + preset, + config_file=config_file, + config_overrides=kwargs, ) - return cls.from_config({**config, **kwargs}) - def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. super().__init_subclass__(**kwargs) diff --git a/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py b/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py index e488f1d0c1..74477cdf03 100644 --- a/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py +++ b/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os -import sentencepiece import tensorflow as tf from keras_nlp.tests.test_case import TestCase @@ -25,17 +23,9 @@ class SentencePieceTokenizerTest(TestCase): def setUp(self): super().setUp() - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox."] + self.proto = os.path.join( + self.get_test_data_dir(), "tokenizer_test_vocab.spm" ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=7, - model_type="WORD", - ) - self.proto = bytes_io.getvalue() def test_tokenize(self): input_data = ["the quick brown fox."] @@ -112,15 +102,13 @@ def test_error_id_out_of_vocabulary(self): with self.assertRaises(ValueError): tokenizer.id_to_token(-1) - def test_from_file(self): - filepath = os.path.join(self.get_temp_dir(), "model.txt") - input_data = ["the quick brown fox."] - with tf.io.gfile.GFile(filepath, "wb") as file: - file.write(self.proto) + def test_from_bytes(self): + with tf.io.gfile.GFile(self.proto, "rb") as file: + proto = file.read() tokenizer = SentencePieceTokenizer( - proto=filepath, + proto=proto, ) - output_data = tokenizer(input_data) + output_data = tokenizer(["the quick brown fox."]) self.assertAllEqual(output_data, [[6, 5, 3, 4]]) def test_tokenize_then_batch(self): @@ -173,6 +161,7 @@ def test_config(self): cloned_tokenizer = SentencePieceTokenizer.from_config( original_tokenizer.get_config() ) + cloned_tokenizer.set_proto(original_tokenizer.proto) self.assertAllEqual( original_tokenizer(input_data), cloned_tokenizer(input_data), diff --git a/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py b/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py index 5a16a76fc0..5fe8f0144d 100644 --- a/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py +++ b/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py @@ -18,7 +18,7 @@ from keras_nlp.tokenizers import tokenizer from keras_nlp.utils.tensor_utils import assert_tf_text_installed from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype try: import tensorflow_text as tf_text @@ -219,7 +219,7 @@ def __init__( ) -> None: assert_tf_text_installed(self.__class__.__name__) - if not is_integer_dtype(dtype): + if not is_int_dtype(dtype): raise ValueError( "Output dtype must be an integer type. 
" f"Received: dtype={dtype}" diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py index dc9ce49427..75f956899f 100644 --- a/keras_nlp/tokenizers/word_piece_tokenizer.py +++ b/keras_nlp/tokenizers/word_piece_tokenizer.py @@ -19,13 +19,14 @@ import tensorflow as tf from keras_nlp.api_export import keras_nlp_export -from keras_nlp.backend import keras from keras_nlp.tokenizers import tokenizer +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring from keras_nlp.utils.tensor_utils import assert_tf_text_installed from keras_nlp.utils.tensor_utils import convert_to_ragged_batch -from keras_nlp.utils.tensor_utils import is_integer_dtype +from keras_nlp.utils.tensor_utils import is_int_dtype from keras_nlp.utils.tensor_utils import is_string_dtype try: @@ -33,6 +34,8 @@ except ImportError: tf_text = None +VOCAB_FILENAME = "vocabulary.txt" + # Matches whitespace and control characters. WHITESPACE_REGEX = r"|".join( [ @@ -305,26 +308,13 @@ def __init__( ) -> None: assert_tf_text_installed(self.__class__.__name__) - if not is_integer_dtype(dtype) and not is_string_dtype(dtype): + if not is_int_dtype(dtype) and not is_string_dtype(dtype): raise ValueError( "Output dtype must be an integer type or a string. " f"Received: dtype={dtype}" ) super().__init__(dtype=dtype, **kwargs) - - if isinstance(vocabulary, str): - self.vocabulary = [ - line.rstrip() for line in tf.io.gfile.GFile(vocabulary) - ] - elif isinstance(vocabulary, Iterable): - # Make a copy. - self.vocabulary = list(vocabulary) - else: - raise ValueError( - "Vocabulary must be an file path or list of terms. " - f"Received: vocabulary={vocabulary}" - ) if oov_token is None: raise ValueError("`oov_token` cannot be None.") @@ -335,8 +325,38 @@ def __init__( self.split_on_cjk = split_on_cjk self.suffix_indicator = suffix_indicator self.oov_token = oov_token + self.set_vocabulary(vocabulary) + + def save_assets(self, dir_path): + path = os.path.join(dir_path, VOCAB_FILENAME) + with open(path, "w", encoding="utf-8") as file: + for token in self.vocabulary: + file.write(f"{token}\n") + + def load_assets(self, dir_path): + path = os.path.join(dir_path, VOCAB_FILENAME) + self.set_vocabulary(path) + + def set_vocabulary(self, vocabulary): + """Set the tokenizer vocabulary to a file or list of strings.""" + if vocabulary is None: + self.vocabulary = None + self._fast_word_piece = None + return + + if isinstance(vocabulary, str): + with open(vocabulary, "r", encoding="utf-8") as file: + self.vocabulary = [line.rstrip() for line in file] + elif isinstance(vocabulary, Iterable): + # Make a defensive copy. + self.vocabulary = list(vocabulary) + else: + raise ValueError( + "Vocabulary must be an file path or list of terms. 
" + f"Received: vocabulary={vocabulary}" + ) - if oov_token not in self.vocabulary: + if self.oov_token not in self.vocabulary: raise ValueError( f'Cannot find `oov_token="{self.oov_token}"` in the ' "vocabulary.\n" @@ -348,22 +368,25 @@ def __init__( self._fast_word_piece = tf_text.FastWordpieceTokenizer( vocab=self.vocabulary, token_out_type=self.compute_dtype, - suffix_indicator=suffix_indicator, - unknown_token=oov_token, + suffix_indicator=self.suffix_indicator, + unknown_token=self.oov_token, no_pretokenization=True, support_detokenization=True, ) def get_vocabulary(self) -> List[str]: """Get the tokenizer vocabulary as a list of strings tokens.""" + self._check_vocabulary() return self.vocabulary def vocabulary_size(self) -> int: """Get the size of the tokenizer vocabulary.""" + self._check_vocabulary() return len(self.vocabulary) def id_to_token(self, id: int) -> str: """Convert an integer id to a string token.""" + self._check_vocabulary() if id >= self.vocabulary_size() or id < 0: raise ValueError( f"`id` must be in range [0, {self.vocabulary_size() - 1}]. " @@ -376,16 +399,14 @@ def token_to_id(self, token: str) -> int: # This will be slow, but keep memory usage down compared to building a # . Assuming the main use case is looking up a few special tokens # early in the vocab, this should be fine. + self._check_vocabulary() return self.vocabulary.index(token) def get_config(self): config = super().get_config() config.update( { - # Ideally a vocabulary would be saved as a plain text asset in - # the saved model. We have no good way to support this - # currently, so we save the vocabulary in the config. - "vocabulary": self.vocabulary, + "vocabulary": None, # Save vocabulary via an asset! "sequence_length": self.sequence_length, "lowercase": self.lowercase, "strip_accents": self.strip_accents, @@ -396,7 +417,15 @@ def get_config(self): ) return config + def _check_vocabulary(self): + if self.vocabulary is None: + raise ValueError( + "No vocabulary has been set for WordPieceTokenizer. Make sure " + "to pass a `vocabulary` argument when creating the layer." + ) + def tokenize(self, inputs): + self._check_vocabulary() if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)): inputs = tf.convert_to_tensor(inputs) @@ -429,6 +458,7 @@ def tokenize(self, inputs): return tokens def detokenize(self, inputs): + self._check_vocabulary() inputs, unbatched, _ = convert_to_ragged_batch(inputs) outputs = self._fast_word_piece.detokenize(inputs) if unbatched: @@ -462,35 +492,19 @@ def from_preset( tokenizer.detokenize([5, 6, 7, 8, 9]) ``` """ - - if not cls.presets: - raise NotImplementedError( - "No presets have been created for this class" - ) - - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - metadata = cls.presets[preset] - - vocabulary = keras.utils.get_file( - "vocab.txt", - metadata["vocabulary_url"], - cache_subdir=os.path.join("models", preset), - file_hash=metadata["vocabulary_hash"], + # We support short IDs for official presets, e.g. `"bert_base_en"`. + # Map these to a Kaggle Models handle. 
+ if preset in cls.presets: + preset = cls.presets[preset]["kaggle_handle"] + + config_file = "tokenizer.json" + check_preset_class(preset, cls, config_file=config_file) + return load_from_preset( + preset, + config_file=config_file, + config_overrides=kwargs, ) - config = metadata["preprocessor_config"] - config.update( - { - "vocabulary": vocabulary, - }, - ) - - return cls.from_config({**config, **kwargs}) - def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. super().__init_subclass__(**kwargs) diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py index 7ba691c5c7..ead098c36c 100644 --- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py +++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py @@ -190,6 +190,7 @@ def test_config(self): cloned_tokenizer = WordPieceTokenizer.from_config( original_tokenizer.get_config() ) + cloned_tokenizer.set_vocabulary(original_tokenizer.get_vocabulary()) self.assertAllEqual( original_tokenizer(input_data), cloned_tokenizer(input_data), diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py b/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py index 8571097e06..dc90075a5c 100644 --- a/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py +++ b/keras_nlp/tokenizers/word_piece_tokenizer_trainer.py @@ -172,7 +172,7 @@ def normalize_and_split(x): if vocabulary_output_file is not None: vocab_text = "".join([line + "\n" for line in vocab]) # Write vocab to file. - with open(vocabulary_output_file, "w") as vocab_file: + with open(vocabulary_output_file, "w", encoding="utf-8") as vocab_file: vocab_file.write(vocab_text) else: return vocab diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py index 62d0f4adf1..03186944bc 100644 --- a/keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py +++ b/keras_nlp/tokenizers/word_piece_tokenizer_trainer_test.py @@ -177,7 +177,7 @@ def test_output_file(self): reserved_tokens=[], ) vocab_from_file = [] - with open(vocab_file, "r") as f: + with open(vocab_file, "r", encoding="utf-8") as f: for line in f: vocab_from_file.append(line.strip()) self.assertAllEqual(vocab_from_file, test_output) diff --git a/keras_nlp/utils/keras_utils.py b/keras_nlp/utils/keras_utils.py index 6e4d43193c..96750754a2 100644 --- a/keras_nlp/utils/keras_utils.py +++ b/keras_nlp/utils/keras_utils.py @@ -155,3 +155,8 @@ def print_row(fields, positions, print_fn, nested_level=0): line += " " * (positions[col] - len(line)) line += "|" * nested_level print_fn(line) + + +@keras.saving.register_keras_serializable(package="keras_nlp") +def gelu_approximate(x): + return keras.activations.gelu(x, approximate=True) diff --git a/keras_nlp/utils/preset_utils.py b/keras_nlp/utils/preset_utils.py new file mode 100644 index 0000000000..6bb2748fd9 --- /dev/null +++ b/keras_nlp/utils/preset_utils.py @@ -0,0 +1,210 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import json
+import os
+
+from keras_nlp.backend import keras
+
+try:
+ import kagglehub
+except ImportError:
+ kagglehub = None
+
+KAGGLE_PREFIX = "kaggle://"
+GS_PREFIX = "gs://"
+TOKENIZER_ASSET_DIR = "assets/tokenizer"
+
+
+def get_file(preset, path):
+ """Download a preset file if necessary and return the local path."""
+ if not isinstance(preset, str):
+ raise ValueError(
+ f"A preset identifier must be a string. Received: preset={preset}"
+ )
+ if preset.startswith(KAGGLE_PREFIX):
+ if kagglehub is None:
+ raise ImportError(
+ "`from_preset()` requires the `kagglehub` package. "
+ "Please install with `pip install kagglehub`."
+ )
+ kaggle_handle = preset.removeprefix(KAGGLE_PREFIX)
+ num_segments = len(kaggle_handle.split("/"))
+ if num_segments not in (4, 5):
+ raise ValueError(
+ "Unexpected Kaggle preset. Kaggle model handles should have "
+ "the form kaggle://{org}/{model}/keras/{variant}[/{version}]. "
+ "For example, 'kaggle://username/bert/keras/bert_base_en' or "
+ "'kaggle://username/bert/keras/bert_base_en/1' (to specify a "
+ f"version). Received: preset={preset}"
+ )
+ return kagglehub.model_download(kaggle_handle, path)
+ elif preset.startswith(GS_PREFIX):
+ url = os.path.join(preset, path)
+ url = url.replace(GS_PREFIX, "https://storage.googleapis.com/")
+ subdir = preset.replace(GS_PREFIX, "gs_")
+ subdir = subdir.replace("/", "_").replace("-", "_")
+ filename = os.path.basename(path)
+ subdir = os.path.join(subdir, os.path.dirname(path))
+ return keras.utils.get_file(
+ filename,
+ url,
+ cache_subdir=os.path.join("models", subdir),
+ )
+ elif os.path.exists(preset):
+ # Assume a local filepath.
+ return os.path.join(preset, path)
+ else:
+ raise ValueError(
+ "Unknown preset identifier. A preset must be one of:\n"
+ "1) a built in preset identifier like `'bert_base_en'`\n"
+ "2) a Kaggle Models handle like `'kaggle://keras/bert/keras/bert_base_en'`\n"
+ "3) a path to a local preset directory like `'./bert_base_en'`\n"
+ "Use `print(cls.presets.keys())` to view all built-in presets for "
+ "API symbol `cls`.\n"
+ f"Received: preset='{preset}'"
+ )
+
+
+def get_tokenizer(layer):
+ """Get the tokenizer from any KerasNLP model or layer."""
+ # Avoid circular import.
+ from keras_nlp.tokenizers.tokenizer import Tokenizer
+
+ if isinstance(layer, Tokenizer):
+ return layer
+ if hasattr(layer, "tokenizer"):
+ return layer.tokenizer
+ if hasattr(layer, "preprocessor"):
+ return getattr(layer.preprocessor, "tokenizer", None)
+ return None
+
+
+def recursive_pop(config, key):
+ """Remove a key from a nested config object."""
+ config.pop(key, None)
+ for value in config.values():
+ if isinstance(value, dict):
+ recursive_pop(value, key)
+
+
+def save_to_preset(
+ layer,
+ preset,
+ save_weights=True,
+ config_filename="config.json",
+ weights_filename="model.weights.h5",
+):
+ """Save a KerasNLP layer to a preset directory."""
+ os.makedirs(preset, exist_ok=True)
+
+ # Save tokenizer assets.
+ tokenizer = get_tokenizer(layer)
+ assets = []
+ if tokenizer:
+ asset_dir = os.path.join(preset, TOKENIZER_ASSET_DIR)
+ os.makedirs(asset_dir, exist_ok=True)
+ tokenizer.save_assets(asset_dir)
+ for asset_path in os.listdir(asset_dir):
+ assets.append(os.path.join(TOKENIZER_ASSET_DIR, asset_path))
+
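Together with `load_from_preset` defined below, this supports a local round trip in the style of `preset_utils_test.py`; the preset name here is one of the built-ins exercised in that test:

```python
# Local preset round trip: save a task, then rebuild it with its tokenizer.
from keras_nlp.models.bert.bert_classifier import BertClassifier
from keras_nlp.utils.preset_utils import load_from_preset, save_to_preset

model = BertClassifier.from_preset("bert_tiny_en_uncased", num_classes=2)
save_to_preset(model, "./my_preset")  # writes config.json, weights, assets
restored = load_from_preset("./my_preset")
```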
+ # Optionally save weights.
+ save_weights = save_weights and hasattr(layer, "save_weights")
+ if save_weights:
+ weights_path = os.path.join(preset, weights_filename)
+ layer.save_weights(weights_path)
+
+ # Save a serialized Keras object.
+ config_path = os.path.join(preset, config_filename)
+ config = keras.saving.serialize_keras_object(layer)
+ # Include references to weights and assets.
+ config["assets"] = assets
+ config["weights"] = weights_filename if save_weights else None
+ recursive_pop(config, "compile_config")
+ recursive_pop(config, "build_config")
+ with open(config_path, "w") as config_file:
+ config_file.write(json.dumps(config, indent=4))
+
+ from keras_nlp import __version__ as keras_nlp_version
+
+ keras_version = keras.version() if hasattr(keras, "version") else None
+
+ # Save any associated metadata.
+ if config_filename == "config.json":
+ metadata = {
+ "keras_version": keras_version,
+ "keras_nlp_version": keras_nlp_version,
+ "parameter_count": layer.count_params(),
+ "date_saved": datetime.datetime.now().strftime("%Y-%m-%d@%H:%M:%S"),
+ }
+ metadata_path = os.path.join(preset, "metadata.json")
+ with open(metadata_path, "w") as metadata_file:
+ metadata_file.write(json.dumps(metadata, indent=4))
+
+
+def load_from_preset(
+ preset,
+ load_weights=True,
+ config_file="config.json",
+ config_overrides={},
+):
+ """Load a KerasNLP layer from a preset directory."""
+ # Load a serialized Keras object.
+ config_path = get_file(preset, config_file)
+ with open(config_path) as config_file:
+ config = json.load(config_file)
+ config["config"] = {**config["config"], **config_overrides}
+ layer = keras.saving.deserialize_keras_object(config)
+
+ # Load any assets for our tokenizers.
+ tokenizer = get_tokenizer(layer)
+ if tokenizer and config["assets"]:
+ for asset in config["assets"]:
+ get_file(preset, asset)
+ config_dir = os.path.dirname(config_path)
+ asset_dir = os.path.join(config_dir, TOKENIZER_ASSET_DIR)
+ tokenizer.load_assets(asset_dir)
+
+ # Optionally load weights.
+ load_weights = load_weights and config["weights"]
+ if load_weights:
+ weights_path = get_file(preset, config["weights"])
+ layer.load_weights(weights_path)
+
+ return layer
+
+
+def check_preset_class(
+ preset,
+ classes,
+ config_file="config.json",
+):
+ """Validate a preset is being loaded on the correct class."""
+ config_path = get_file(preset, config_file)
+ with open(config_path) as config_file:
+ config = json.load(config_file)
+ cls = keras.saving.get_registered_object(config["registered_name"])
+ if not isinstance(classes, (tuple, list)):
+ classes = (classes,)
+ # Allow subclasses for testing a base class, e.g.
+ # `check_preset_class(preset, Backbone)`
+ if not any(issubclass(cls, x) for x in classes):
+ raise ValueError(
+ f"Unexpected class in preset `'{preset}'`. "
+ "When calling `from_preset()` on a class object, the preset class "
+ f"must match allowed classes. Allowed classes are `{classes}`. "
+ f"Received: `{cls}`."
+ )
+ return cls
diff --git a/keras_nlp/utils/preset_utils_test.py b/keras_nlp/utils/preset_utils_test.py
new file mode 100644
index 0000000000..44dc39f477
--- /dev/null
+++ b/keras_nlp/utils/preset_utils_test.py
@@ -0,0 +1,107 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +import pytest +from absl.testing import parameterized + +from keras_nlp.models.albert.albert_classifier import AlbertClassifier +from keras_nlp.models.backbone import Backbone +from keras_nlp.models.bert.bert_classifier import BertClassifier +from keras_nlp.models.roberta.roberta_classifier import RobertaClassifier +from keras_nlp.models.task import Task +from keras_nlp.tests.test_case import TestCase +from keras_nlp.utils.preset_utils import check_preset_class +from keras_nlp.utils.preset_utils import load_from_preset +from keras_nlp.utils.preset_utils import save_to_preset + + +class PresetUtilsTest(TestCase): + @parameterized.parameters( + (AlbertClassifier, "albert_base_en_uncased", "sentencepiece"), + (RobertaClassifier, "roberta_base_en", "bytepair"), + (BertClassifier, "bert_tiny_en_uncased", "wordpiece"), + ) + @pytest.mark.keras_3_only + @pytest.mark.large + def test_preset_saving(self, cls, preset_name, tokenizer_type): + save_dir = self.get_temp_dir() + model = cls.from_preset(preset_name, num_classes=2) + save_to_preset(model, save_dir) + + if tokenizer_type == "bytepair": + vocab_filename = "assets/tokenizer/vocabulary.json" + expected_assets = [ + "assets/tokenizer/vocabulary.json", + "assets/tokenizer/merges.txt", + ] + elif tokenizer_type == "sentencepiece": + vocab_filename = "assets/tokenizer/vocabulary.spm" + expected_assets = ["assets/tokenizer/vocabulary.spm"] + else: + vocab_filename = "assets/tokenizer/vocabulary.txt" + expected_assets = ["assets/tokenizer/vocabulary.txt"] + + # Check existence of files + self.assertTrue(os.path.exists(os.path.join(save_dir, vocab_filename))) + self.assertTrue(os.path.exists(os.path.join(save_dir, "config.json"))) + self.assertTrue( + os.path.exists(os.path.join(save_dir, "model.weights.h5")) + ) + self.assertTrue(os.path.exists(os.path.join(save_dir, "metadata.json"))) + + # Check the model config (`config.json`) + config_json = open(os.path.join(save_dir, "config.json"), "r").read() + self.assertTrue( + "build_config" not in config_json + ) # Test on raw json to include nested keys + self.assertTrue( + "compile_config" not in config_json + ) # Test on raw json to include nested keys + config = json.loads(config_json) + self.assertEqual(set(config["assets"]), set(expected_assets)) + self.assertEqual(config["weights"], "model.weights.h5") + + # Try loading the model from preset directory + self.assertEqual(cls, check_preset_class(save_dir, cls)) + self.assertEqual(cls, check_preset_class(save_dir, Task)) + with self.assertRaises(ValueError): + # Preset is a subclass of Task, not Backbone. + check_preset_class(save_dir, Backbone) + + # Try loading the model from preset directory + restored_model = load_from_preset(save_dir) + + train_data = ( + ["the quick brown fox.", "the slow brown fox."], # Features. 
+ ) + model_input_data = model.preprocessor(*train_data) + restored_model_input_data = restored_model.preprocessor(*train_data) + + # Check that saved vocab is equal to the original preset vocab + self.assertAllClose(model_input_data, restored_model_input_data) + + # Check model outputs + self.assertAllEqual( + model(model_input_data), restored_model(restored_model_input_data) + ) + + def test_preset_errors(self): + with self.assertRaisesRegex(ValueError, "must be a string"): + AlbertClassifier.from_preset(AlbertClassifier) + + with self.assertRaisesRegex(ValueError, "Unknown preset identifier"): + AlbertClassifier.from_preset("snaggle://bort/bort/bort") diff --git a/keras_nlp/utils/tensor_utils.py b/keras_nlp/utils/tensor_utils.py index 97df75e74f..a88d80a4da 100644 --- a/keras_nlp/utils/tensor_utils.py +++ b/keras_nlp/utils/tensor_utils.py @@ -15,6 +15,7 @@ import tensorflow as tf from keras_nlp.backend import config +from keras_nlp.backend import keras from keras_nlp.backend import ops try: @@ -62,7 +63,7 @@ def convert_to_backend_tensor_or_python_list(x): If we encounter one of these types in torch or jax, we will instead covert the tensor to simple pythonic types (lists of strings). """ - if isinstance(x, tf.RaggedTensor) or x.dtype == tf.string: + if isinstance(x, tf.RaggedTensor) or getattr(x, "dtype", None) == tf.string: return tensor_to_list(x) return ops.convert_to_tensor(x) @@ -151,19 +152,21 @@ def is_tensor_type(x): return hasattr(x, "__array__") -def is_floating_dtype(dtype): +def standardize_dtype(dtype): + if config.keras_3(): + return keras.backend.standardize_dtype(dtype) if hasattr(dtype, "name"): - dtype = dtype.name - return "float" in dtype + return dtype.name + return dtype -def is_integer_dtype(dtype): - if hasattr(dtype, "name"): - dtype = dtype.name - return "int" in dtype +def is_float_dtype(dtype): + return "float" in standardize_dtype(dtype) + + +def is_int_dtype(dtype): + return "int" in standardize_dtype(dtype) def is_string_dtype(dtype): - if hasattr(dtype, "name"): - dtype = dtype.name - return "string" in dtype + return "string" in standardize_dtype(dtype) diff --git a/keras_nlp/version_utils.py b/keras_nlp/version_utils.py new file mode 100644 index 0000000000..15fede3a08 --- /dev/null +++ b/keras_nlp/version_utils.py @@ -0,0 +1,23 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_nlp.api_export import keras_nlp_export + +# Unique source of truth for the version number. 
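Once released, the new export is user-visible; a quick check, assuming an installed build that includes this file:

```python
# The version symbol is exported via keras_nlp_export below.
import keras_nlp

print(keras_nlp.version())  # "0.7.0" for this release
```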
+__version__ = "0.7.0" + + +@keras_nlp_export("keras_nlp.version") +def version(): + return __version__ diff --git a/pip_build.py b/pip_build.py index 7774815e1f..0c83cbb436 100644 --- a/pip_build.py +++ b/pip_build.py @@ -28,6 +28,7 @@ ``` """ import argparse +import datetime import glob import os import pathlib @@ -45,56 +46,104 @@ ] -def build(): - if os.path.exists(build_directory): - raise ValueError(f"Directory already exists: {build_directory}") +def ignore_files(_, filenames): + return [f for f in filenames if "_test" in f] + + +def export_version_string(version, is_nightly=False): + """Export Version and Package Name.""" + if is_nightly: + date = datetime.datetime.now() + version += f".dev{date.strftime('%Y%m%d%H')}" + # Replaces `name="keras-nlp"` in `setup.py` with `keras-nlp-nightly` + with open("setup.py") as f: + setup_contents = f.read() + with open("setup.py", "w") as f: + setup_contents = setup_contents.replace( + 'name="keras-nlp"', 'name="keras-nlp-nightly"' + ) + setup_contents = setup_contents.replace( + '"tensorflow-text', '"tf-nightly", "tensorflow-text-nightly' + ) + f.write(setup_contents) + + # Overwrite the version string with our package version. + with open(os.path.join(package, "src", "version_utils.py")) as f: + version_contents = f.readlines() + with open(os.path.join(package, "src", "version_utils.py"), "w") as f: + for line in version_contents: + if line.startswith("__version__"): + f.write(f'__version__ = "{version}"\n') + else: + f.write(line) + # Make sure to export the __version__ string. + with open(os.path.join(package, "__init__.py")) as f: + init_contents = f.read() + with open(os.path.join(package, "__init__.py"), "w") as f: + f.write(init_contents) + f.write("from keras_nlp.src.version_utils import __version__\n") + + +def copy_source_to_build_directory(root_path): + # Copy sources (`keras_nlp/` directory and setup files) to build dir + os.chdir(root_path) + os.mkdir(build_directory) + shutil.copytree( + package, os.path.join(build_directory, package), ignore=ignore_files + ) + for fname in to_copy: + shutil.copy(fname, os.path.join(f"{build_directory}", fname)) + os.chdir(build_directory) + + +def run_namex_conversion(): + # Restructure the codebase so that source files live in `keras_nlp/src` + namex.convert_codebase(package, code_directory="src") + # Generate API __init__.py files in `keras_nlp/` + namex.generate_api_files(package, code_directory="src", verbose=True) + + +def build_and_save_output(root_path, __version__): + """Build the package.""" + os.system("python3 -m build") + + # Save the dist files generated by the build process + os.chdir(root_path) + if not os.path.exists(dist_directory): + os.mkdir(dist_directory) + for fpath in glob.glob( + os.path.join(build_directory, dist_directory, "*.*") + ): + shutil.copy(fpath, dist_directory) + + # Find the .whl file path whl_path = None - try: - # Copy sources (`keras_nlp/` directory and setup files) to build directory - root_path = pathlib.Path(__file__).parent.resolve() - os.chdir(root_path) - os.mkdir(build_directory) - shutil.copytree(package, os.path.join(build_directory, package)) - for fname in to_copy: - shutil.copy(fname, os.path.join(f"{build_directory}", fname)) - os.chdir(build_directory) + for fname in os.listdir(dist_directory): + if __version__ in fname and fname.endswith(".whl"): + whl_path = os.path.abspath(os.path.join(dist_directory, fname)) + if whl_path: + print(f"Build successful. 
Wheel file available at {whl_path}") + else: + print("Build failed.") + return whl_path - # Restructure the codebase so that source files live in `keras_nlp/src` - namex.convert_codebase(package, code_directory="src") - # Generate API __init__.py files in `keras_nlp/` - namex.generate_api_files(package, code_directory="src", verbose=True) +def build(root_path, is_nightly=False): + if os.path.exists(build_directory): + raise ValueError(f"Directory already exists: {build_directory}") + try: + copy_source_to_build_directory(root_path) + run_namex_conversion() # Make sure to export the __version__ string from keras_nlp.src import __version__ # noqa: E402 - with open(os.path.join(package, "__init__.py")) as f: - init_contents = f.read() - with open(os.path.join(package, "__init__.py"), "w") as f: - f.write(init_contents + "\n\n" + f'__version__ = "{__version__}"\n') - - # Build the package - os.system("python3 -m build") - - # Save the dist files generated by the build process - os.chdir(root_path) - if not os.path.exists(dist_directory): - os.mkdir(dist_directory) - for fpath in glob.glob( - os.path.join(build_directory, dist_directory, "*.*") - ): - shutil.copy(fpath, dist_directory) - - # Find the .whl file path - for fname in os.listdir(dist_directory): - if __version__ in fname and fname.endswith(".whl"): - whl_path = os.path.abspath(os.path.join(dist_directory, fname)) - print(f"Build successful. Wheel file available at {whl_path}") + export_version_string(__version__, is_nightly) + return build_and_save_output(root_path, __version__) finally: # Clean up: remove the build directory (no longer needed) shutil.rmtree(build_directory) - return whl_path def install_whl(whl_fpath): @@ -109,7 +158,13 @@ def install_whl(whl_fpath): action="store_true", help="Whether to install the generated wheel file.", ) + parser.add_argument( + "--nightly", + action="store_true", + help="Whether to generate nightly wheel file.", + ) args = parser.parse_args() - whl_path = build() + root_path = pathlib.Path(__file__).parent.resolve() + whl_path = build(root_path, args.nightly) if whl_path and args.install: install_whl(whl_path) diff --git a/requirements-common.txt b/requirements-common.txt index 21334a40c2..5c2a8a3d90 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -1,8 +1,8 @@ # Library deps. -keras-core>=0.1.6 dm-tree regex rich +kagglehub # Tooling deps. astor packaging @@ -16,3 +16,4 @@ namex # Optional deps. rouge-score sentencepiece +tensorflow-datasets diff --git a/requirements-jax-cuda.txt b/requirements-jax-cuda.txt new file mode 100644 index 0000000000..c09b306264 --- /dev/null +++ b/requirements-jax-cuda.txt @@ -0,0 +1,14 @@ +# Tensorflow cpu-only version. +tf-nightly-cpu==2.16.0.dev20231221 # Pin a working nightly until rc0. +tensorflow-text-nightly==2.16.0.dev20231221 # Pin a working nightly until rc0. + +# Torch cpu-only version. +--extra-index-url https://download.pytorch.org/whl/cpu +torch>=2.1.0 +torchvision>=0.16.0 + +# Jax with cuda support. +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax[cuda12_pip] + +-r requirements-common.txt diff --git a/requirements-macos-m1.txt b/requirements-macos-m1.txt deleted file mode 100644 index 05dd07e604..0000000000 --- a/requirements-macos-m1.txt +++ /dev/null @@ -1,16 +0,0 @@ -# WARNING: KerasNLP has no official support for MacOS M1 at this time. The -# following will pull required depenencies from the following external sources. 
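For reference, the `--nightly` path in the new `pip_build.py` above derives a `.devYYYYMMDDHH` suffix from the current timestamp before rewriting the package name in `setup.py`. A minimal sketch of the version derivation (the base version string here is illustrative; the real one is read from `version_utils.py`):

```python
# Sketch of the nightly version suffix built by export_version_string.
import datetime

version = "0.7.0"  # illustrative; read from version_utils.py in practice
date = datetime.datetime.now()
version += f".dev{date.strftime('%Y%m%d%H')}"
print(version)  # e.g. "0.7.0.dev2023122113"
```

The script is invoked as `python3 pip_build.py --nightly`, optionally with `--install` to install the resulting wheel.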
-# - https://developer.apple.com/metal/tensorflow-plugin/ -# - https://github.com/sun1638650145/Libraries-and-Extensions-for-TensorFlow-for-Apple-Silicon/ -# These are not provided by Google, please review both of these dependencies -# before proceeding. - -# Core deps. -tensorflow-macos~=2.9 -https://github.com/sun1638650145/Libraries-and-Extensions-for-TensorFlow-for-Apple-Silicon/releases/download/v2.9/tensorflow_text-2.9.0-cp39-cp39-macosx_11_0_arm64.whl -tensorflow-datasets -# The metal plugin breaks many tests, so is not enabled by default. -# tensorflow-metal~=0.5 - -# Common deps. --r requirements-common.txt diff --git a/requirements-nightly.txt b/requirements-nightly.txt deleted file mode 100644 index 22a0c6e55a..0000000000 --- a/requirements-nightly.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Core deps. -tf-nightly -tensorflow-text-nightly -tfds-nightly - -# Common deps. --r requirements-common.txt diff --git a/requirements-tensorflow-cuda.txt b/requirements-tensorflow-cuda.txt new file mode 100644 index 0000000000..21a8ed2463 --- /dev/null +++ b/requirements-tensorflow-cuda.txt @@ -0,0 +1,13 @@ +# Tensorflow with cuda support. +tf-nightly[and-cuda]==2.16.0.dev20231221 # Pin a working nightly until rc0. +tensorflow-text-nightly==2.16.0.dev20231221 # Pin a working nightly until rc0. + +# Torch cpu-only version. +--extra-index-url https://download.pytorch.org/whl/cpu +torch>=2.1.0 +torchvision>=0.16.0 + +# Jax cpu-only version. +jax[cpu] + +-r requirements-common.txt diff --git a/requirements-torch-cuda.txt b/requirements-torch-cuda.txt new file mode 100644 index 0000000000..c71c51e478 --- /dev/null +++ b/requirements-torch-cuda.txt @@ -0,0 +1,13 @@ +# Tensorflow cpu-only version. +tf-nightly-cpu==2.16.0.dev20231221 # Pin a working nightly until rc0. +tensorflow-text-nightly==2.16.0.dev20231221 # Pin a working nightly until rc0. + +# Torch with cuda support. +--extra-index-url https://download.pytorch.org/whl/cu121 +torch==2.1.2 +torchvision==0.16.2 + +# Jax cpu-only version. +jax[cpu] + +-r requirements-common.txt diff --git a/requirements.txt b/requirements.txt index 99adbbd656..fa1dc91943 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,13 @@ -# Core deps. -tensorflow~=2.13.0 -tensorflow-text~=2.13.0 -tensorflow-datasets +# Tensorflow. +tf-nightly-cpu==2.16.0.dev20231221 # Pin a working nightly until rc0. +tensorflow-text-nightly==2.16.0.dev20231221 # Pin a working nightly until rc0. + +# Torch. +--extra-index-url https://download.pytorch.org/whl/cpu +torch>=2.1.0 +torchvision>=0.16.0 + +# Jax. +jax[cpu] -# Common deps. -r requirements-common.txt diff --git a/setup.py b/setup.py index 13214e70e0..b246d18ea1 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,10 @@ def get_version(rel_path): HERE = pathlib.Path(__file__).parent README = (HERE / "README.md").read_text() +if os.path.exists("keras_nlp/version_utils.py"): + VERSION = get_version("keras_nlp/version_utils.py") +else: + VERSION = get_version("keras_nlp/src/version_utils.py") setup( name="keras-nlp", @@ -45,7 +49,7 @@ def get_version(rel_path): ), long_description=README, long_description_content_type="text/markdown", - version=get_version("keras_nlp/__init__.py"), + version=VERSION, url="https://github.com/keras-team/keras-nlp", author="Keras team", author_email="keras-nlp@google.com", @@ -58,6 +62,7 @@ def get_version(rel_path): "regex", "rich", "dm-tree", + "kagglehub", # Don't require tensorflow-text on MacOS, there are no binaries for ARM. 
# Also, we rely on tensorflow *transitively* through tensorflow-text. # This avoid a slowdown during `pip install keras-nlp` where pip would diff --git a/tools/checkpoint_conversion/convert_llama_checkpoints.py b/tools/checkpoint_conversion/convert_llama_checkpoints.py new file mode 100644 index 0000000000..5eb3973f36 --- /dev/null +++ b/tools/checkpoint_conversion/convert_llama_checkpoints.py @@ -0,0 +1,141 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import torch +from transformers import AutoModel + +from keras_nlp.models.llama.llama_backbone import LlamaBackbone + +os.environ["KERAS_BACKEND"] = "torch" + +# from huggingface_hub import login +# llama weights as of now are on request access +# login(token=' torch.Tensor: + """ + freqs_cis: complex - (seq_len, head_dim / 2) + x: complex - (bsz, seq_len, head_dim / 2) + """ + ndim = x.ndim + assert 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]), ( + freqs_cis.shape, + (x.shape[1], x.shape[-1]), + ) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = _reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.args = args + + self.n_heads: int = args.n_heads + self.n_kv_heads: int = args.n_kv_heads + + self.repeats = self.n_heads // self.n_kv_heads + self.sliding_window = self.args.sliding_window + + self.scale = self.args.head_dim**-0.5 + + self.wq = nn.Linear(args.dim, args.n_heads * args.head_dim, bias=False) + self.wk = nn.Linear( + args.dim, args.n_kv_heads * args.head_dim, bias=False + ) + self.wv = nn.Linear( + args.dim, args.n_kv_heads * args.head_dim, bias=False + ) + self.wo = nn.Linear(args.n_heads * args.head_dim, args.dim, bias=False) + self.cache_k = torch.empty( + ( + args.max_batch_size, + args.sliding_window, + self.n_kv_heads, + self.args.head_dim, + ), + dtype=torch.float16, + ) + self.cache_v = torch.empty( + ( + args.max_batch_size, + args.sliding_window, + self.n_kv_heads, + self.args.head_dim, + ), + dtype=torch.float16, + ) + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + positions: torch.Tensor, + mask: Optional[torch.Tensor], + ) -> torch.Tensor: + bsz, seqlen, _ = x.shape + + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + xq = xq.view(bsz, seqlen, self.n_heads, self.args.head_dim) + xk = xk.view(bsz, seqlen, self.n_kv_heads, self.args.head_dim) + xv = xv.view(bsz, seqlen, self.n_kv_heads, self.args.head_dim) + xq, xk = 
apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + # The cache is a rotating buffer + scatter_pos = (positions[-self.sliding_window :] % self.sliding_window)[ + None, :, None, None + ] + scatter_pos = scatter_pos.repeat( + bsz, 1, self.n_kv_heads, self.args.head_dim + ) + self.cache_k[:bsz].scatter_( + dim=1, + index=scatter_pos, + src=xk[:, -self.sliding_window :].to(self.cache_k.dtype), + ) + self.cache_v[:bsz].scatter_( + dim=1, + index=scatter_pos, + src=xv[:, -self.sliding_window :].to(self.cache_v.dtype), + ) + + if positions.shape[0] > 1: + # prefill + key, value = repeat_kv(xk, xv, self.repeats) + else: + cur_pos = positions[-1].item() + 1 + key, value = repeat_kv( + self.cache_k[:bsz, :cur_pos, ...].to(xk.dtype), + self.cache_v[:bsz, :cur_pos, ...].to(xv.dtype), + self.repeats, + ) + + query = xq.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + # scores : [bsz, n_heads, seqlen | 1, seqlen] + scores = torch.matmul(query, key.transpose(2, 3)) * self.scale + + if mask is not None: + scores += mask[None, None, ...] + + scores = scores.float() + scores = nn.functional.softmax(scores, dim=-1).type_as(query) + output = torch.matmul( + scores, value + ) # (bs, n_local_heads, slen, head_dim) + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + return self.wo(output) + + +class FeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + + self.w1 = nn.Linear(args.dim, args.hidden_dim, bias=False) + self.w2 = nn.Linear(args.hidden_dim, args.dim, bias=False) + self.w3 = nn.Linear(args.dim, args.hidden_dim, bias=False) + + def forward(self, x) -> torch.Tensor: + return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x)) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +class TransformerBlock(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.attention = Attention(args) + self.feed_forward = FeedForward(args=args) + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.args = args + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + positions: torch.Tensor, + mask: Optional[torch.Tensor], + ) -> torch.Tensor: + r = self.attention.forward( + self.attention_norm(x), freqs_cis, positions, mask + ) + h = x + r + r = self.feed_forward.forward(self.ffn_norm(h)) + out = h + r + return out + + +def precompute_freqs_cis( + dim: int, end: int, theta: float = 10000.0 +) -> torch.Tensor: + freqs = 1.0 / ( + theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim) + ) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + return torch.polar(torch.ones_like(freqs), freqs) # complex64 + + +class TorchTransformer(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.args = args + self.vocab_size = args.vocab_size + self.n_layers = args.n_layers + assert self.vocab_size > 0 + + self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim) + + self.layers = torch.nn.ModuleList( + [TransformerBlock(args=args) for _ in range(args.n_layers)] + ) + + self.norm = 
RMSNorm(args.dim, eps=args.norm_eps) + + self.output = nn.Linear(args.dim, args.vocab_size, bias=False) + + self.freqs_cis = precompute_freqs_cis(self.args.head_dim, 128_000) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + ): + h = self.tok_embeddings(input_ids) + freqs_cis = self.freqs_cis[positions] + + mask: Optional[torch.Tensor] = None + if input_ids.shape[1] > 1: + seqlen = input_ids.shape[1] + tensor = torch.full( + (seqlen, seqlen), + dtype=h.dtype, + fill_value=1, + device=h.device, + ) + mask = torch.tril(tensor, diagonal=0).to(h.dtype) + # make the mask banded to account for sliding window + mask = torch.triu(mask, diagonal=-self.args.sliding_window) + mask = torch.log(mask) + + for layer in self.layers: + h = layer(h, freqs_cis, positions, mask) + + return self.output(self.norm(h)).float() + + @staticmethod + def from_folder( + folder: Path, max_batch_size: int = 1, device="cpu", dtype=torch.float16 + ): + with open(folder / "params.json", "r") as f: + model_args = ModelArgs(**json.loads(f.read())) + model_args.max_batch_size = max_batch_size + model = TorchTransformer(model_args).to(device=device, dtype=dtype) + loaded = torch.load(folder / "consolidated.00.pth") + model.load_state_dict(loaded) + return model + + +def port_weights( + model_k3: MistralBackbone, model_torch: TorchTransformer, params: ModelArgs +): + model_k3.get_layer("token_embedding").embeddings.assign( + model_torch.tok_embeddings.weight.detach().cpu().numpy() + ) + + for i in range(model_k3.num_layers): + model_k3.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._key_dense.set_weights( + [ + model_torch.layers[i] + .attention.wk.weight.T.reshape( + params.dim, params.n_kv_heads, params.head_dim + ) + .detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._query_dense.set_weights( + [ + model_torch.layers[i] + .attention.wq.weight.T.reshape( + params.dim, params.n_heads, params.head_dim + ) + .detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._value_dense.set_weights( + [ + model_torch.layers[i] + .attention.wv.weight.T.reshape( + params.dim, params.n_kv_heads, params.head_dim + ) + .detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._self_attention_layer._output_dense.set_weights( + [ + model_torch.layers[i] + .attention.wo.weight.T.reshape( + params.n_heads, params.head_dim, params.dim + ) + .detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._self_attention_layernorm.set_weights( + [model_torch.layers[i].attention_norm.weight.detach().cpu().numpy()] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._feedforward_intermediate_dense.set_weights( + [ + model_torch.layers[i] + .feed_forward.w3.weight.T.detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._feedforward_output_dense.set_weights( + [ + model_torch.layers[i] + .feed_forward.w2.weight.T.detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._feedforward_gate_dense.set_weights( + [ + model_torch.layers[i] + .feed_forward.w1.weight.T.detach() + .cpu() + .numpy() + ] + ) + model_k3.get_layer( + f"transformer_layer_{i}" + )._feedforward_layernorm.set_weights( + [model_torch.layers[i].ffn_norm.weight.detach().cpu().numpy()] + ) + + model_k3.get_layer("sequence_output_layernorm").set_weights( + 
[model_torch.norm.weight.detach().cpu().numpy()] + ) + model_k3.get_layer("token_embedding").reverse_embeddings.assign( + model_torch.output.weight.T.detach().cpu().numpy() + ) + + +if __name__ == "__main__": + with open(MODEL_PATH / "params.json", "r") as params_file: + params = ModelArgs(**json.load(params_file)) + + model_torch = TorchTransformer.from_folder( + MODEL_PATH, device="cpu", dtype=torch.float16 + ) + print("Torch model loaded") + model_k3 = MistralBackbone( + vocabulary_size=32000, + hidden_dim=4096, + num_layers=32, + num_query_heads=32, + num_key_value_heads=8, + intermediate_dim=14336, + sliding_window=4096, + layer_norm_epsilon=1e-6, + dtype="float16", + ) + print("Keras 3 model loaded.") + + port_weights(model_k3, model_torch, params) + print("Weight transfer done.") + + model_k3.save_weights("mistral_7b.weights.h5") + print("Weights saved.") diff --git a/tools/checkpoint_conversion/convert_t5_checkpoints.py b/tools/checkpoint_conversion/convert_t5_checkpoints.py new file mode 100644 index 0000000000..89a365f00f --- /dev/null +++ b/tools/checkpoint_conversion/convert_t5_checkpoints.py @@ -0,0 +1,389 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import os +import shutil + +import numpy as np +import transformers +from absl import app +from absl import flags +from checkpoint_conversion_utils import get_md5_checksum +from keras import ops + +import keras_nlp + +PRESET_MAP = { + "t5_small_multi": "t5-small", + "t5_base_multi": "t5-base", + "t5_large_multi": "t5-large", + "flan_small_multi": "google/flan-t5-small", + "flan_base_multi": "google/flan-t5-base", + "flan_large_multi": "google/flan-t5-large", +} + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "preset", "t5_base_multi", f'Must be one of {",".join(PRESET_MAP.keys())}' +) +os.environ["KERAS_BACKEND"] = "torch" + + +def extract_vocab(hf_tokenizer): + proto_path = f"./{FLAGS.preset}/vocab.spm" + print(f"\n-> Save KerasNLP vocab to `{proto_path}`.") + + # Huggingface has a save_vocabulary function but it's not byte-for-byte + # with the source. Instead copy the original downloaded file directly. 
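A note on the repeated `.weight.T` in `port_weights` above: `torch.nn.Linear` stores its kernel as `(out_features, in_features)`, while Keras dense kernels are laid out `(input_dim, output_dim)`, so every kernel is transposed (and, for the attention projections, additionally reshaped per head) before assignment. A self-contained sanity check of that layout convention (not part of the conversion script):

```python
# Check that transposing a torch Linear weight yields a Keras-layout
# kernel that reproduces the same matmul.
import numpy as np
import torch

linear = torch.nn.Linear(4, 8, bias=False)
kernel = linear.weight.T.detach().cpu().numpy()  # (4, 8): (in, out)

x = np.random.rand(1, 4).astype("float32")
np.testing.assert_allclose(
    x @ kernel,  # Keras-style matmul with the transposed kernel
    linear(torch.from_numpy(x)).detach().numpy(),
    rtol=1e-5,
    atol=1e-6,
)
```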
+ shutil.copyfile( + transformers.utils.hub.get_file_from_repo( + hf_tokenizer.name_or_path, "spiece.model" + ), + proto_path, + ) + + keras_tokenizer = keras_nlp.models.T5Tokenizer( + proto=proto_path, + ) + + print("-> Print MD5 checksum of the vocab files.") + print(f"`{proto_path}` md5sum: ", get_md5_checksum(proto_path)) + + return keras_tokenizer + + +def convert_checkpoints(hf_model): + keras_nlp_model = keras_nlp.models.T5Backbone.from_preset( + FLAGS.preset, load_weights=False + ) + + hf_wts = hf_model.state_dict() + print("Original weights:") + print(list(hf_wts.keys())) + + for i in range(keras_nlp_model.num_layers): + for section in ["encoder", "decoder"]: + n = 0 + + # Token embedding layer + keras_nlp_model.get_layer("token_embedding").embeddings.assign( + hf_wts[f"{section}.embed_tokens.weight"] + ) + if not keras_nlp_model.tie_embedding_weights: + keras_nlp_model.get_layer( + "token_embedding" + ).reverse_embeddings.assign( + hf_wts["lm_head.weight"].transpose(1, 0).numpy() + ) + + # Query, key, value, and output projectors in self-attention + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention.query_projector.kernel.assign( + hf_wts[f"{section}.block.{i}.layer.{n}.SelfAttention.q.weight"] + .transpose(1, 0) + .numpy() + ) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention.key_projector.kernel.assign( + hf_wts[f"{section}.block.{i}.layer.{n}.SelfAttention.k.weight"] + .transpose(1, 0) + .numpy() + ) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention.value_projector.kernel.assign( + hf_wts[f"{section}.block.{i}.layer.{n}.SelfAttention.v.weight"] + .transpose(1, 0) + .numpy() + ) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention.output_projector.kernel.assign( + hf_wts[f"{section}.block.{i}.layer.{n}.SelfAttention.o.weight"] + .transpose(1, 0) + .numpy() + ) + + # Add relative attention bias + if keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention.use_relative_attention_bias: + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention.relative_attention_bias.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.SelfAttention.relative_attention_bias.weight" + ].numpy() + ) + + # Self-attention norm + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).self_attention_layer_norm.weight.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.layer_norm.weight" + ].numpy() + ) + + # Increment for next layer + n += 1 + + if section == "decoder": + # Cross-attention QKV and output proj (one between encoder and decoder) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).cross_attention.query_projector.kernel.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.EncDecAttention.q.weight" + ] + .transpose(1, 0) + .numpy() + ) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).cross_attention.key_projector.kernel.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.EncDecAttention.k.weight" + ] + .transpose(1, 0) + .numpy() + ) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).cross_attention.value_projector.kernel.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.EncDecAttention.v.weight" + ] + .transpose(1, 0) + .numpy() + ) + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).cross_attention.output_projector.kernel.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.EncDecAttention.o.weight" + ] + .transpose(1, 0) 
+ .numpy() + ) + + # Cross-attention layer norm + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).cross_attention_layer_norm.weight.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.layer_norm.weight" + ].numpy() + ) + # Increment for next layer + n += 1 + + if keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).use_gated_activation: + # Input projection layer + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).input_projector.weights[0].assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.DenseReluDense.wi_0.weight" + ] + .transpose(1, 0) + .numpy() + ) + + # Gated activation layer + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).gate_projector.weights[0].assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.DenseReluDense.wi_1.weight" + ] + .transpose(1, 0) + .numpy() + ) + else: + # Input projection layer + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).input_projector.weights[0].assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.DenseReluDense.wi.weight" + ] + .transpose(1, 0) + .numpy() + ) + + # Output projection layer + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).output_projector.weights[0].assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.DenseReluDense.wo.weight" + ] + .transpose(1, 0) + .numpy() + ) + + # Layer norm + keras_nlp_model.get_layer( + f"transformer_{section}_layer_{i}" + ).layer_norm.weight.assign( + hf_wts[ + f"{section}.block.{i}.layer.{n}.layer_norm.weight" + ].numpy() + ) + + # Final normalization + keras_nlp_model.get_layer(f"{section}_output_layer_norm").weights[ + -1 + ].assign(hf_wts[f"{section}.final_layer_norm.weight"].numpy()) + + return keras_nlp_model + + +def check_output( + keras_model, + keras_tokenizer, + hf_model, + hf_tokenizer, +): + print("\n-> Compare the outputs.") + encoder_input = ["the quick brown fox jumped."] + decoder_input = ["the quick brown fox fell."] + + sequence_length = 12 + + # KerasNLP Tokenization + packer = keras_nlp.layers.StartEndPacker( + sequence_length=sequence_length, + pad_value=keras_tokenizer.pad_token_id, + end_value=keras_tokenizer.end_token_id, + ) + encoder_token_ids = packer(keras_tokenizer(encoder_input)) + encoder_padding_mask = encoder_token_ids != keras_tokenizer.pad_token_id + decoder_token_ids = packer(keras_tokenizer(decoder_input)) + decoder_padding_mask = decoder_token_ids != keras_tokenizer.pad_token_id + keras_inputs = { + "encoder_token_ids": encoder_token_ids, + "encoder_padding_mask": encoder_padding_mask, + "decoder_token_ids": decoder_token_ids, + "decoder_padding_mask": decoder_padding_mask, + } + + # HF Tokenization. + hf_encoder_inputs = hf_tokenizer( + encoder_input, + padding="max_length", + max_length=sequence_length, + return_tensors="pt", + ) + hf_decoder_inputs = hf_tokenizer( + decoder_input, + padding="max_length", + max_length=sequence_length, + return_tensors="pt", + ) + hf_inputs = { + "input_ids": hf_encoder_inputs["input_ids"], + "attention_mask": hf_encoder_inputs["attention_mask"], + "decoder_input_ids": hf_decoder_inputs["input_ids"], + "decoder_attention_mask": hf_decoder_inputs["attention_mask"], + } + + # Compare tokenized inputs. This should be a complete match.
+ print("-> KerasNLP inputs:") + for k, v in keras_inputs.items(): + print(k, v) + print("-> HF inputs:") + for k, v in hf_inputs.items(): + print(k, v) + + # Forward pass + keras_out = keras_model(keras_inputs) + hf_out = hf_model(**hf_inputs, output_hidden_states=True) + + # Only compare non-padded token ids. + keras_hidden_states = keras_out["decoder_sequence_output"] + hf_hidden_states = hf_out.decoder_hidden_states[-1] + + keras_outputs = ops.take_along_axis( + keras_hidden_states, ops.where(decoder_padding_mask) + ) + hf_outputs = ops.take_along_axis( + hf_hidden_states, ops.where(decoder_padding_mask) + ) + + print("-> KerasNLP output:", keras_outputs[0:5]) + print("-> HF output:", hf_outputs[0:5]) + np.testing.assert_allclose( + keras_outputs.detach().numpy(), hf_outputs.detach().numpy(), atol=1e-5 + ) + + if keras_model.tie_embedding_weights: + keras_hidden_states = keras_hidden_states * ( + keras_model.hidden_dim**-0.5 + ) + + keras_logits = keras_model.token_embedding( + keras_hidden_states, reverse=True + ) + hf_logits = hf_out.logits + print("-> KerasNLP logits:", keras_logits[0:5]) + print("-> HF logits:", hf_logits[0:5]) + np.testing.assert_allclose( + keras_logits.detach().numpy(), hf_logits.detach().numpy(), atol=1e-3 + ) + + +def count_params(weights): + shapes = [v.shape for v in weights] + return int(sum(math.prod(p) for p in shapes)) + + +def main(_): + hf_id = PRESET_MAP[FLAGS.preset] + shutil.rmtree(f"./{FLAGS.preset}", ignore_errors=True) + os.mkdir(f"./{FLAGS.preset}") + + print("\n-> Convert weights.") + hf_model = transformers.T5ForConditionalGeneration.from_pretrained(hf_id) + keras_model = convert_checkpoints(hf_model) + + # Save the model. + model_path = f"./{FLAGS.preset}/model.weights.h5" + print(f"\n-> Save KerasNLP model weights to `{model_path}`.") + keras_model.save_weights(model_path) + print("-> Print MD5 checksum of the model weights files.") + print(f"`{model_path}` md5sum: ", get_md5_checksum(model_path)) + print(f"-> Param count {count_params(keras_model.weights)}") + + print("\n-> Convert vocab.") + hf_tokenizer = transformers.AutoTokenizer.from_pretrained(hf_id) + keras_tokenizer = extract_vocab(hf_tokenizer) + + check_output( + keras_model, + keras_tokenizer, + hf_model, + hf_tokenizer, + ) + + +if __name__ == "__main__": + flags.mark_flag_as_required("preset") + app.run(main) diff --git a/tools/convert_legacy_presets.py b/tools/convert_legacy_presets.py new file mode 100644 index 0000000000..c1470cf64a --- /dev/null +++ b/tools/convert_legacy_presets.py @@ -0,0 +1,117 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script was used to convert our legacy presets into the directory format +used by Kaggle. + +This script is for reference only. 
+""" + +import os +import re +import shutil + +os.environ["KERAS_HOME"] = os.getcwd() + +from keras_nlp import models # noqa: E402 +from keras_nlp.src.utils.preset_utils import save_to_preset # noqa: E402 + +BUCKET = "keras-nlp-kaggle" + + +def to_snake_case(name): + name = re.sub(r"\W+", "", name) + name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + name = re.sub("([a-z])([A-Z])", r"\1_\2", name).lower() + return name + + +if __name__ == "__main__": + backbone_models = [ + (models.AlbertBackbone, models.AlbertTokenizer), + (models.BartBackbone, models.BartTokenizer), + (models.BertBackbone, models.BertTokenizer), + (models.DebertaV3Backbone, models.DebertaV3Tokenizer), + (models.DistilBertBackbone, models.DistilBertTokenizer), + (models.FNetBackbone, models.FNetTokenizer), + (models.GPT2Backbone, models.GPT2Tokenizer), + (models.OPTBackbone, models.OPTTokenizer), + (models.RobertaBackbone, models.RobertaTokenizer), + (models.T5Backbone, models.T5Tokenizer), + (models.WhisperBackbone, models.WhisperTokenizer), + (models.XLMRobertaBackbone, models.XLMRobertaTokenizer), + ] + for backbone_cls, tokenizer_cls in backbone_models: + for preset in backbone_cls.presets: + backbone = backbone_cls.from_preset( + preset, name=to_snake_case(backbone_cls.__name__) + ) + tokenizer = tokenizer_cls.from_preset( + preset, name=to_snake_case(tokenizer_cls.__name__) + ) + save_to_preset( + backbone, + preset, + config_filename="config.json", + ) + save_to_preset( + tokenizer, + preset, + config_filename="tokenizer.json", + ) + # Delete first to clean up any exising version. + os.system(f"gsutil rm -rf gs://{BUCKET}/{preset}") + os.system(f"gsutil cp -r {preset} gs://{BUCKET}/{preset}") + for root, _, files in os.walk(preset): + for file in files: + path = os.path.join(BUCKET, root, file) + os.system( + f"gcloud storage objects update gs://{path} " + "--add-acl-grant=entity=AllUsers,role=READER" + ) + # Clean up local disk usage. + shutil.rmtree("models") + shutil.rmtree(preset) + + # Handle our single task model. + preset = "bert_tiny_en_uncased_sst2" + task = models.BertClassifier.from_preset( + preset, name=to_snake_case(models.BertClassifier.__name__) + ) + tokenizer = models.BertTokenizer.from_preset( + preset, name=to_snake_case(models.BertTokenizer.__name__) + ) + save_to_preset( + task, + preset, + config_filename="config.json", + ) + save_to_preset( + tokenizer, + preset, + config_filename="tokenizer.json", + ) + # Delete first to clean up any exising version. + os.system(f"gsutil rm -rf gs://{BUCKET}/{preset}") + os.system(f"gsutil cp -r {preset} gs://{BUCKET}/{preset}") + for root, _, files in os.walk(preset): + for file in files: + path = os.path.join(BUCKET, root, file) + os.system( + f"gcloud storage objects update gs://{path} " + "--add-acl-grant=entity=AllUsers,role=READER" + ) + # Clean up local disk usage. + shutil.rmtree("models") + shutil.rmtree(preset) diff --git a/tools/sentencepiece_testing/__init__.py b/tools/sentencepiece_testing/__init__.py new file mode 100644 index 0000000000..ba0c2545e4 --- /dev/null +++ b/tools/sentencepiece_testing/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tools/sentencepiece_testing/create_albert_test_proto.py b/tools/sentencepiece_testing/create_albert_test_proto.py new file mode 100644 index 0000000000..80e82b3cd1 --- /dev/null +++ b/tools/sentencepiece_testing/create_albert_test_proto.py @@ -0,0 +1,37 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox", "the earth is round"], + "albert_test_vocab.spm", + vocab_size=12, + model_type="WORD", + pad_id=0, + unk_id=1, + bos_id=2, + eos_id=3, + pad_piece="", + unk_piece="", + bos_piece="[CLS]", + eos_piece="[SEP]", + user_defined_symbols="[MASK]", + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_deberta_v3_test_proto.py b/tools/sentencepiece_testing/create_deberta_v3_test_proto.py new file mode 100644 index 0000000000..c3f98867c5 --- /dev/null +++ b/tools/sentencepiece_testing/create_deberta_v3_test_proto.py @@ -0,0 +1,37 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox", "the earth is round"], + "deberta_v3_test_vocab.spm", + vocab_size=12, + model_type="WORD", + pad_id=0, + bos_id=1, + eos_id=2, + unk_id=3, + pad_piece="[PAD]", + bos_piece="[CLS]", + eos_piece="[SEP]", + unk_piece="[UNK]", + user_defined_symbols="[MASK]", + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_f_net_test_proto.py b/tools/sentencepiece_testing/create_f_net_test_proto.py new file mode 100644 index 0000000000..949a5692f9 --- /dev/null +++ b/tools/sentencepiece_testing/create_f_net_test_proto.py @@ -0,0 +1,37 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox", "the earth is round"], + "f_net_test_vocab.spm", + vocab_size=12, + model_type="WORD", + pad_id=0, + unk_id=1, + bos_id=2, + eos_id=3, + pad_piece="", + unk_piece="", + bos_piece="[CLS]", + eos_piece="[SEP]", + user_defined_symbols="[MASK]", + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_mistral_test_proto.py b/tools/sentencepiece_testing/create_mistral_test_proto.py new file mode 100644 index 0000000000..1a2a501b7a --- /dev/null +++ b/tools/sentencepiece_testing/create_mistral_test_proto.py @@ -0,0 +1,32 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox", "the earth is round"], + "mistral_test_vocab.spm", + vocab_size=10, + model_type="WORD", + pad_id=-1, + unk_id=0, + bos_id=1, + eos_id=2, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_no_special_token_proto.py b/tools/sentencepiece_testing/create_no_special_token_proto.py new file mode 100644 index 0000000000..c13ef6e05a --- /dev/null +++ b/tools/sentencepiece_testing/create_no_special_token_proto.py @@ -0,0 +1,30 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
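In the Mistral proto above, `pad_id=-1` tells the SentencePiece trainer to reserve no pad piece at all, while `unk`, `bos`, and `eos` land at ids 0, 1, and 2. A quick way to inspect the ids baked into a generated proto (a sketch; the filename assumes the script above has been run and the proto committed to the test-data directory):

```python
# Inspect special-token ids in a trained SentencePiece proto.
import sentencepiece

sp = sentencepiece.SentencePieceProcessor(
    model_file="keras_nlp/tests/test_data/mistral_test_vocab.spm"
)
print(sp.pad_id())  # -1: no pad piece reserved
print(sp.unk_id(), sp.bos_id(), sp.eos_id())  # 0 1 2
```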
+ +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["abc"], + "no_special_token_vocab.spm", + vocab_size=5, + pad_id=-1, + eos_id=-1, + bos_id=-1, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_sentence_piece_tokenizer_proto.py b/tools/sentencepiece_testing/create_sentence_piece_tokenizer_proto.py new file mode 100644 index 0000000000..a40eade848 --- /dev/null +++ b/tools/sentencepiece_testing/create_sentence_piece_tokenizer_proto.py @@ -0,0 +1,28 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox."], + "tokenizer_test_vocab.spm", + vocab_size=7, + model_type="WORD", + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_t5_test_proto.py b/tools/sentencepiece_testing/create_t5_test_proto.py new file mode 100644 index 0000000000..b7e28160e5 --- /dev/null +++ b/tools/sentencepiece_testing/create_t5_test_proto.py @@ -0,0 +1,36 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox", "the earth is round"], + "t5_test_vocab.spm", + vocab_size=11, + model_type="WORD", + bos_id=-1, + pad_id=0, + eos_id=1, + unk_id=2, + pad_piece="", + eos_piece="", + unk_piece="", + user_defined_symbols="[MASK]", + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/create_xlm_roberta_test_proto.py b/tools/sentencepiece_testing/create_xlm_roberta_test_proto.py new file mode 100644 index 0000000000..988d161f99 --- /dev/null +++ b/tools/sentencepiece_testing/create_xlm_roberta_test_proto.py @@ -0,0 +1,37 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from tools.sentencepiece_testing.utils import train_sentencepiece + + +def main(): + train_sentencepiece( + ["the quick brown fox", "the earth is round"], + "xlm_roberta_test_vocab.spm", + vocab_size=12, + model_type="WORD", + pad_id=0, + unk_id=1, + bos_id=2, + eos_id=3, + pad_piece="", + unk_piece="", + bos_piece="[CLS]", + eos_piece="[SEP]", + user_defined_symbols="[MASK]", + ) + + +if __name__ == "__main__": + main() diff --git a/tools/sentencepiece_testing/utils.py b/tools/sentencepiece_testing/utils.py new file mode 100644 index 0000000000..9deebd9737 --- /dev/null +++ b/tools/sentencepiece_testing/utils.py @@ -0,0 +1,33 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import io +import pathlib + +import sentencepiece + + +def train_sentencepiece(data, filename, *args, **kwargs): + bytes_io = io.BytesIO() + sentencepiece.SentencePieceTrainer.train( + sentence_iterator=iter(data), model_writer=bytes_io, *args, **kwargs + ) + with open( + pathlib.Path(__file__).parent.parent.parent + / "keras_nlp" + / "tests" + / "test_data" + / filename, + mode="wb", + ) as f: + f.write(bytes_io.getbuffer())
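Once `train_sentencepiece` has written a proto into `keras_nlp/tests/test_data/`, the file can be sanity-checked by loading it into a tokenizer. A minimal sketch, assuming the repo root as working directory and the proto produced by `create_sentence_piece_tokenizer_proto.py` above:

```python
# Load a generated test proto and tokenize a sample string.
import keras_nlp

tokenizer = keras_nlp.tokenizers.SentencePieceTokenizer(
    proto="keras_nlp/tests/test_data/tokenizer_test_vocab.spm"
)
print(tokenizer(["the quick brown fox."]))
```

Training into a `BytesIO` buffer and then writing the bytes out keeps the helper deterministic about where the asset lands, which is why every `create_*_test_proto.py` script routes through this one function.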