3 changes: 3 additions & 0 deletions unid2t/.gitignore
@@ -0,0 +1,3 @@
data_preprocess/__pycache__/
.idea/
tools/__pycache__/
50 changes: 49 additions & 1 deletion unid2t/README.md
@@ -1 +1,49 @@
The code is currently going through the approval process; the full version will be released as soon as possible.
[//]: # (#Unified Data-to-Text Pretraining)


## Unified Structured Data as Graph for Data-to-Text Pretraining

## Prepare Environment
You can create an environment for UniD2T and install the required Python packages with:
```
pip install -r requirements.txt
```
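
Before launching training, it can help to confirm that PyTorch was installed with CUDA support and can see your GPUs (a minimal sanity-check sketch; it assumes `torch` is pulled in by ```requirements.txt```):
```
# check_env.py -- minimal environment sanity check (assumes torch is installed)
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available: ", torch.cuda.is_available())
print("GPU count:      ", torch.cuda.device_count())
```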


## Data Preprocessing
You can download the original datasets from their official websites:
[ToTTo](https://github.com/google-research-datasets/ToTTo),
[CoSQL](https://yale-lily.github.io/cosql),
[WebNLG](https://gitlab.com/shimorina/webnlg-dataset/-/tree/master/release_v3.0),
[DART](https://github.com/Yale-LILY/DART),
[WikiBio](https://rlebret.github.io/wikipedia-biography-dataset/),
[WikiTableT](https://github.com/mingdachen/WikiTableT).

Then place the downloaded data in the ```/orig_datasets/``` directory and use the scripts in ```/data_preprocess/``` to process each dataset. The processed data will be saved in ```cleanout_datasets```; for example, for the ToTTo dataset:
```
python /data_preprocess/totto/convert_totto_to_unified_graph.py
```
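
Each dataset has its own converter under ```/data_preprocess/```. If you want to process several datasets in one pass, a small driver can call the converters in turn (a sketch; only the ToTTo script path above comes from this README, the other paths are assumed names and may differ in the repository):
```
# preprocess_all.py -- sketch: run each dataset's converter in sequence.
# Only the ToTTo path is taken from the README; the commented paths are assumptions.
import subprocess

converters = [
    "data_preprocess/totto/convert_totto_to_unified_graph.py",
    # "data_preprocess/dart/convert_dart_to_unified_graph.py",     # assumed name
    # "data_preprocess/webnlg/convert_webnlg_to_unified_graph.py", # assumed name
]

for script in converters:
    print("Running", script)
    subprocess.run(["python", script], check=True)  # abort on the first failure
```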

## Pretrain
Merge the data processed in the previous step:
```
python /data_preprocess/convert_totto_to_unified_graph.py
```
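Conceptually, the merge step combines the per-dataset files under ```cleanout_datasets/``` into a single pre-training corpus. A rough illustration of what that amounts to (the file names and JSON-lines layout are assumptions, not the repository's actual merge logic):
```
# merge_sketch.py -- illustration only, NOT the repository's merge script.
# Assumes the cleaned datasets are JSON-lines files under cleanout_datasets/.
sources = [
    "cleanout_datasets/totto_with_unified_graph/totto_train_data.jsonl",  # path taken from the finetune configs
    # add the other processed datasets here
]

with open("cleanout_datasets/pretrain_merged.jsonl", "w", encoding="utf-8") as out:  # output name is assumed
    for src in sources:
        with open(src, encoding="utf-8") as f:
            for line in f:
                out.write(line)
```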
Pre-training on multiple GPUs:
```
torchrun \
--nproc_per_node=4 \
./pretrain.py \
--config /pretrain_config/**.yml
```


## Finetune
Finetune on a single GPU:
```
python finetune.py --config config/**.yml
```
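
All finetuning hyper-parameters come from the YAML files passed via ```--config``` (several examples are included in this change). A quick way to inspect one before launching (a sketch assuming PyYAML; the config path is an example name):
```
# inspect_config.py -- sketch: peek at a finetune config before training.
import yaml  # PyYAML, assumed to be available in the environment

with open("config/finetune_totto.yml", encoding="utf-8") as f:  # example path; actual names may differ
    cfg = yaml.safe_load(f)

for key in ("init_model_path", "learning_rate", "train_batch_size", "max_epochs", "saved_dir"):
    print(key, "=", cfg.get(key))
```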




@@ -0,0 +1,54 @@
# basic
seed: 42
device: 'cuda'
model_name: 't5'
datatype: 'linear'
enable_uda_relative_pos: False
# tokenizer_path: '/data/nt12_ssd_gluster/myself/pretrained_models/t5-small'
tokenizer_path: 't5-large'
special_token_path: '/root/data/cleanout_datasets/special_tokens.txt'
data_processor: 'linear'
# task_source_prefix: 'Describe the following data: '
modified_default_plm_config: True
plms_dropout_rate: 0.1

# training
train_type: 'finetune'
dist_train: False
experiment_name: 'finetuning_t5_base_on_cosql_2e-4'
init_model_path: 't5-large'
max_epochs: 80
max_steps: -1
early_stopping_patience: 8
start_eval_from: 0
eval_every: 1
max_keep_checkpoints: -1
report_every: 100
saved_dir: '/root/data/guanbao/finetuning/cosql_T5large_linear'

learner: fairseq_adafactor
learning_rate: 2e-04
adam_epsilon: 0.00000001
max_grad_norm: 2.0
lr_scheduler: 'none'
warmup_steps: 0

# training data
train_file_src: '/root/data/cleanout_datasets/cosql_with_unified_graph/cosql_train.json'
train_n_example: -1
train_batch_size: 16
max_source_length: 1024
max_target_length: -1
train_num_workers: 5


# evaluate data
eval_noise_data: False
val_metric: bleu
eval_file_src: '/root/data/cleanout_datasets/cosql_with_unified_graph/cosql_dev.json'
eval_n_example: -1
eval_batch_size: 32
num_beams: 5
eval_max_source_length: 1024
eval_max_target_length: 128
eval_num_workers: 5
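
A small caveat about these configs: under the YAML 1.1 rules that PyYAML follows, scientific-notation numbers without a decimal point, such as ```2e-04```, are loaded as strings rather than floats, so they need an explicit cast somewhere downstream (a quick demonstration, assuming PyYAML):
```
import yaml

cfg = yaml.safe_load("learning_rate: 2e-04\nadam_epsilon: 0.00000001")
print(type(cfg["learning_rate"]))   # <class 'str'>  -- no decimal point, so not resolved as a float
print(type(cfg["adam_epsilon"]))    # <class 'float'>
print(float(cfg["learning_rate"]))  # 0.0002 -- explicit cast needed before use
```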
@@ -0,0 +1,54 @@
# basic
seed: 42
device: 'cuda'
model_name: 't5'
datatype: 'linear'
enable_uda_relative_pos: False
# tokenizer_path: '/data/nt12_ssd_gluster/myself/pretrained_models/t5-small'
tokenizer_path: 't5-large'
special_token_path: './cleanout_datasets/special_tokens.txt'
data_processor: 'linear'
# task_source_prefix: 'Describe the following data: '
modified_default_plm_config: True
plms_dropout_rate: 0.1

# training
train_type: 'finetune'
dist_train: False
experiment_name: 'finetuning_t5_base_on_dart_2e-4'
init_model_path: 't5-large'
max_epochs: 80
max_steps: -1
early_stopping_patience: 8
start_eval_from: 0
eval_every: 1
max_keep_checkpoints: -1
report_every: 100
saved_dir: '/root/data/guanbao/finetuning/dart_T5large_linear'

learner: fairseq_adafactor
learning_rate: 2e-04
adam_epsilon: 0.00000001
max_grad_norm: 2.0
lr_scheduler: 'none'
warmup_steps: 0

# training data
train_file_src: '/root/data/cleanout_datasets/dart/dart-v1.1.1-full-train_with_unified_graph_simplified_and_lower_relationt.json'
train_n_example: -1
train_batch_size: 16
max_source_length: 1024
max_target_length: -1
train_num_workers: 5


# evaluate data
eval_noise_data: False
val_metric: bleu
eval_file_src: '/root/data/cleanout_datasets/dart/dart-v1.1.1-full-dev_with_unified_graph_simplified_and_lower_relationt.json'
eval_n_example: -1
eval_batch_size: 32
num_beams: 5
eval_max_source_length: 1024
eval_max_target_length: 128
eval_num_workers: 5
@@ -0,0 +1,54 @@
# basic
seed: 42
device: 'cuda'
model_name: 't5'
datatype: 'linear'
enable_uda_relative_pos: False
# tokenizer_path: '/data/nt12_ssd_gluster/myself/pretrained_models/t5-small'
tokenizer_path: 't5-large'
special_token_path: './cleanout_datasets/special_tokens.txt'
data_processor: 'linear'
# task_source_prefix: 'Describe the following data: '
modified_default_plm_config: True
plms_dropout_rate: 0.1

# training
train_type: 'finetune'
dist_train: False
experiment_name: 'finetuning_t5_base_on_totto_2e-4'
init_model_path: 't5-large'
max_epochs: 80
max_steps: -1
early_stopping_patience: 8
start_eval_from: 0
eval_every: 1
max_keep_checkpoints: -1
report_every: 100
saved_dir: '/root/data/guanbao/finetuning/totto_T5_large_linear'

learner: fairseq_adafactor
learning_rate: 2e-04
adam_epsilon: 0.00000001
max_grad_norm: 2.0
lr_scheduler: 'none'
warmup_steps: 0

# training data
train_file_src: '/root/data/cleanout_datasets/totto_with_unified_graph/totto_train_data.jsonl'
train_n_example: -1
train_batch_size: 16
max_source_length: 1024
max_target_length: -1
train_num_workers: 5


# evaluate data
eval_noise_data: False
val_metric: bleu
eval_file_src: '/root/data/cleanout_datasets/totto_with_unified_graph/totto_dev_data.jsonl'
eval_n_example: -1
eval_batch_size: 32
num_beams: 5
eval_max_source_length: 1024
eval_max_target_length: 128
eval_num_workers: 5
@@ -0,0 +1,55 @@
# basic
seed: 42
device: 'cuda'
model_name: 't5'
datatype: 'linear'
enable_uda_relative_pos: False
# tokenizer_path: '/data/nt12_ssd_gluster/myself/pretrained_models/t5-small'
tokenizer_path: 't5-large'
special_token_path: '/root/data/liliang/experiments/UnifiedData2TextPretrain/cleanout_datasets/special_tokens.txt'
data_processor: 'linear'
# task_source_prefix: 'Describe the following data: '
modified_default_plm_config: True
plms_dropout_rate: 0.1

# training
train_type: 'finetune'
dist_train: False
experiment_name: 'finetuning_t5_large_on_webnlg17-4e-5'
init_model_path: 't5-large'
max_epochs: 10
max_steps: -1
early_stopping_patience: 8
start_eval_from: 0
eval_every: 1
max_keep_checkpoints: -1
report_every: 100
saved_dir: '/root/data/guanbao/finetuning/webnlg17_T5_large_linear'

learner: fairseq_adafactor
learning_rate: 4e-5
adam_epsilon: 0.00000001
max_grad_norm: 2.0
lr_scheduler: 'none'
warmup_steps: 0


# training data
train_file_src: '/root/data/cleanout_datasets/cleanout_webnlg17/train.json'
train_n_example: -1
train_batch_size: 16
max_source_length: 1024
max_target_length: -1
train_num_workers: 5


# evaluate data
eval_noise_data: False
val_metric: bleu
eval_file_src: '/root/data/cleanout_datasets/cleanout_webnlg17/test.json'
eval_n_example: -1
eval_batch_size: 32
num_beams: 5
eval_max_source_length: 1024
eval_max_target_length: 128
eval_num_workers: 5
@@ -0,0 +1,54 @@
# basic
seed: 42
device: 'cuda'
model_name: 't5'
datatype: 'linear'
enable_uda_relative_pos: False
# tokenizer_path: '/data/nt12_ssd_gluster/myself/pretrained_models/t5-small'
tokenizer_path: 't5-large'
special_token_path: './cleanout_datasets/special_tokens.txt'
data_processor: 'linear'
# task_source_prefix: 'Describe the following data: '
modified_default_plm_config: True
plms_dropout_rate: 0.1

# training
train_type: 'finetune'
dist_train: False
experiment_name: 'finetuning_t5_base_on_wikibio_2e-4'
init_model_path: 't5-large'
max_epochs: 80
max_steps: -1
early_stopping_patience: 8
start_eval_from: 0
eval_every: 1
max_keep_checkpoints: -1
report_every: 100
saved_dir: '/root/data/guanbao/finetuning/wikibio_T5_large_linear'

learner: fairseq_adafactor
learning_rate: 2e-04
adam_epsilon: 0.00000001
max_grad_norm: 2.0
lr_scheduler: 'none'
warmup_steps: 0

# training data
train_file_src: '/root/data/cleanout_datasets/wikibio/train.json'
train_n_example: -1
train_batch_size: 16
max_source_length: 1024
max_target_length: -1
train_num_workers: 5


# evaluate data
eval_noise_data: False
val_metric: bleu
eval_file_src: '/root/data/cleanout_datasets/wikibio/test.json'
eval_n_example: -1
eval_batch_size: 32
num_beams: 5
eval_max_source_length: 1024
eval_max_target_length: 128
eval_num_workers: 5
@@ -0,0 +1,55 @@
# basic
seed: 42
device: 'cuda'
model_name: 't5'
datatype: 'linear'
enable_uda_relative_pos: False
# tokenizer_path: '/data/nt12_ssd_gluster/myself/pretrained_models/t5-small'
tokenizer_path: 't5-large'
special_token_path: '/root/data/cleanout_datasets/special_tokens.txt'
data_processor: 'linear'
# task_source_prefix: 'Describe the following data: '
modified_default_plm_config: True
plms_dropout_rate: 0.1

# training
train_type: 'finetune'
dist_train: False
experiment_name: 'finetuning_t5_base_on_wikitableT_2e-4'
init_model_path: 't5-large'
max_epochs: 80
max_steps: -1
early_stopping_patience: 8
start_eval_from: 0
eval_every: 1
max_keep_checkpoints: -1
report_every: 100
saved_dir: '/root/data/guanbao/finetuning/wikitableT_T5_large_linear'

learner: fairseq_adafactor
learning_rate: 2e-04
adam_epsilon: 0.00000001
max_grad_norm: 2.0
lr_scheduler: 'none'
warmup_steps: 0

# training data
train_file_src: '/root/data/cleanout_datasets/WikitableT/train_udt.json'
train_n_example: -1
train_batch_size: 16
max_source_length: 512
max_target_length: 208
train_num_workers: 5


# evaluate data
eval_noise_data: False
val_metric: bleu
eval_file_src: '/root/data/cleanout_datasets/WikitableT/dev_udt.json'
eval_n_example: -1
eval_batch_size: 32
num_beams: 5
eval_max_source_length: 512
eval_max_target_length: 128
eval_num_workers: 5