voidful · oscar-shih · May 6, 2023 · May 6, 2023 · May 6, 2023 · May 6, 2023
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,268 @@
+### Example user template
+
+
+# IntelliJ project files
+.idea
+*.iml
+out
+gen
+### macOS template
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Project-specific folders
+Hubert
+train_dev_pred*
+asrp
+nlg-eval
+whisper-ft
+asr-trainer
+wandb
+# Output files
+*.txt
+*.pt
+*.arrow
+nmsqa-*
+hubert_*
+g_00500000
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+conferences.yml
+config.json
+how2
+backup
diff --git a/causal_train.py b/causal_train.py
@@ -0,0 +1,55 @@
+import json
+import math
+
+import numpy as np
+from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+from transformers import Trainer
+from transformers import TrainingArguments
+
+from module.data_processing import get_train_valid_dataset
+from module.eval_metric import compute_metrics_fn
+
+# Load model and tokenizer and Set training parameters
+model = AutoModelForCausalLM.from_pretrained(
+    "voidful/stablelm-tuned-alpha-3b-unit")
+tokenizer = AutoTokenizer.from_pretrained(
+    "voidful/stablelm-tuned-alpha-3b-unit")
+
+training_args = TrainingArguments(
+    output_dir="./training_output/stablelm-tuned-alpha-3b-unit",
+    num_train_epochs=10,
+    per_device_train_batch_size=1,
+    per_device_eval_batch_size=1,
+    warmup_steps=500,
+    weight_decay=3e-3,
+    logging_dir="./logs",
+    logging_steps=10,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    save_total_limit=2,
+    learning_rate=5e-4,
+    fp16=True,
+    gradient_accumulation_steps=8,
+)
+
+# Load dataset
+train_dataset, valid_dataset = get_train_valid_dataset(training_args,
+                                                       tokenizer, model.config)
+# Initialize Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=valid_dataset,
+    tokenizer=tokenizer,
+    compute_metrics=compute_metrics_fn,
+    # data_collator=data_collator,
+    # prediction_loss_only=True,
+    # post_process_function=preprocess_logits_for_metrics
+)
+# Train model
+trainer.train()
+# Evaluate model
+eval_results = trainer.evaluate()
+print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
diff --git a/create_nmsqacsv.py b/create_nmsqacsv.py
@@ -0,0 +1,44 @@
+import os
+
+import datasets
+import IPython.display as ipd
+from tqdm import tqdm
+
+train_set = datasets.load_dataset("voidful/NMSQA_audio", split="train")
+dev_set = datasets.load_dataset("voidful/NMSQA_audio", split="dev")
+train_dir, dev_dir = "./NMSQA-train-wav", "./NMSQA-dev-wav"
+os.makedirs(train_dir, exist_ok=True)
+os.makedirs(dev_dir, exist_ok=True)
+with open("NMSQA-train.csv", "w") as f:
+    f.writelines("path,text\n")
+with open("NMSQA-dev.csv", "w") as f:
+    f.writelines("path,text\n")
+
+for data in tqdm(train_set):
+    try:
+        audio_data = data["content_segment_audio_path"]["array"]
+        audio = ipd.Audio(data=audio_data, autoplay=False, rate=22050)
+    except:
+        continue
+    gt_text = data["content_segment_normalized_text"].replace('"', "'")
+    ids = data["id"]
+    file_name = ids + ".wav"
+    with open(os.path.join(train_dir, file_name), "wb") as f:
+        f.write(audio.data)
+    with open("NMSQA-train.csv", "a") as f:
+        f.writelines(
+            os.path.join(train_dir, file_name) + ',"' + gt_text + '"\n')
+
+for data in tqdm(dev_set):
+    try:
+        audio_data = data["content_segment_audio_path"]["array"]
+        audio = ipd.Audio(data=audio_data, autoplay=False, rate=22050)
+    except:
+        continue
+    gt_text = data["content_segment_normalized_text"].replace('"', "'")
+    ids = data["id"]
+    file_name = ids + ".wav"
+    with open(os.path.join(dev_dir, file_name), "wb") as f:
+        f.write(audio.data)
+    with open("NMSQA-dev.csv", "a") as f:
+        f.writelines(os.path.join(dev_dir, file_name) + ',"' + gt_text + '"\n')