Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
prepare dataset for fine-tuning whisper
  • Loading branch information
oscar-shih authored Jun 20, 2023
commit 603640d1b4c1374501ba5b889912407eaa4089e2
48 changes: 48 additions & 0 deletions create_nmsqacsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import csv
import os
import struct
import wave

import datasets
import IPython.display as ipd
import numpy as np
import pyarrow.parquet as pq
import transformers
from datasets import Audio
from scipy.io.wavfile import read, write

# Load the NMSQA audio dataset and pick out the two splits exported below.
dataset = datasets.load_dataset("voidful/NMSQA_audio")
train_set, dev_set = dataset['train'], dataset['dev']

# One output folder of WAV files per split; existing folders are reused.
train_dir = "./NMSQA-train-wav"
dev_dir = "./NMSQA-dev-wav"
for wav_dir in (train_dir, dev_dir):
    os.makedirs(wav_dir, exist_ok=True)

# Start each manifest from scratch with only the header row; the export loops
# further down append one "path,text" line per clip.
for manifest_name in ("NMSQA-train.csv", "NMSQA-dev.csv"):
    with open(manifest_name, "w") as f:
        f.write("path,text\n")

# Export the train split: one WAV file per content segment, plus one manifest
# row ("path,text") pairing that file with its normalized transcript.
#
# The manifest is opened once in append mode (its header row is written
# earlier in the script) instead of being reopened for every sample, and rows
# go through the csv module so a transcript containing commas or quotes is
# quoted rather than silently spilling into extra columns.
# Default text-mode newline handling is kept and the writer emits "\n", so
# rows without special characters are byte-identical to plain
# `path + ',' + text + "\n"` lines.
with open("NMSQA-train.csv", "a") as manifest:
    row_writer = csv.writer(manifest, lineterminator="\n")
    for data in train_set:
        try:
            audio_data = data['content_segment_audio_path']['array']
            # ipd.Audio is used only as an in-memory encoder; its `.data`
            # bytes are what gets written to the .wav file below.
            # NOTE(review): rate is hard-coded to 24000 -- confirm it matches
            # the dataset's actual sampling rate.
            audio = ipd.Audio(data=audio_data, autoplay=False, rate=24000)
        except Exception as err:
            # Best-effort export: a sample whose audio is missing or cannot
            # be encoded is skipped, but the skip is reported, and
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f"skipping train sample {data.get('id')!r}: {err!r}")
            continue
        gt_text = data['content_segment_normalized_text']
        file_name = data['id'] + '.wav'
        wav_path = os.path.join(train_dir, file_name)
        with open(wav_path, "wb") as f:
            f.write(audio.data)
        row_writer.writerow([wav_path, gt_text])

# Export the dev split: one WAV file per content segment, plus one manifest
# row ("path,text") pairing that file with its normalized transcript.
#
# The manifest is opened once in append mode (its header row is written
# earlier in the script) instead of being reopened for every sample, and rows
# go through the csv module so a transcript containing commas or quotes is
# quoted rather than silently spilling into extra columns.
# Default text-mode newline handling is kept and the writer emits "\n", so
# rows without special characters are byte-identical to plain
# `path + ',' + text + "\n"` lines.
with open("NMSQA-dev.csv", "a") as manifest:
    row_writer = csv.writer(manifest, lineterminator="\n")
    for data in dev_set:
        try:
            audio_data = data['content_segment_audio_path']['array']
            # ipd.Audio is used only as an in-memory encoder; its `.data`
            # bytes are what gets written to the .wav file below.
            # NOTE(review): rate is hard-coded to 24000 -- confirm it matches
            # the dataset's actual sampling rate.
            audio = ipd.Audio(data=audio_data, autoplay=False, rate=24000)
        except Exception as err:
            # Best-effort export: a sample whose audio is missing or cannot
            # be encoded is skipped, but the skip is reported, and
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f"skipping dev sample {data.get('id')!r}: {err!r}")
            continue
        gt_text = data['content_segment_normalized_text']
        file_name = data['id'] + '.wav'
        wav_path = os.path.join(dev_dir, file_name)
        with open(wav_path, "wb") as f:
            f.write(audio.data)
        row_writer.writerow([wav_path, gt_text])