Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
del write json in dataset.py
  • Loading branch information
CPFLAME committed Aug 26, 2022
commit 057e6e5fc73db23f4867ed24a35ec323d36b58e0
8 changes: 4 additions & 4 deletions projects/RWKV_v4/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ def __init__(self, data_dir, ctx_len, epoch_length_fixed):
for u in unique:
xxObj[xx] = u
xx += 1
with open("vocab.json", "w", encoding="utf-16") as vocab_file:
vocab_file.write(json.dumps(xxObj, ensure_ascii=False))
# NOTE: writing the vocabulary to vocab.json is disabled; the original code is kept commented out below.
# with open("vocab.json", "w", encoding="utf-16") as vocab_file:
# vocab_file.write(json.dumps(xxObj, ensure_ascii=False))

data_size, vocab_size = len(data), len(unique)
print("data has %d tokens, %d unique." % (data_size, vocab_size))
Expand All @@ -37,8 +38,7 @@ def __len__(self):

def __getitem__(self, idx):
# cheat: pick a random spot in dataset
# i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))
i = 1
i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))
chunk = self.data[i : i + self.ctx_len + 1]
dix = [self.stoi[s] for s in chunk]
x = flow.tensor(dix[:-1], dtype=flow.long)
Expand Down