# Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

import h5py
import numpy as np
import paddle
from paddle.io import DataLoader, Dataset

from utils.collate import Stack


def create_pretraining_dataset(args,
                               input_file,
                               data_holders,
                               worker_init=None,
                               places=None):
    train_data = PretrainingDataset(
        input_file=input_file, max_pred_length=args.max_predictions_per_seq)
    train_batch_sampler = paddle.io.BatchSampler(
        train_data, batch_size=args.batch_size, shuffle=True)

    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # Field indices of each sample produced by PretrainingDataset; the
        # extra masked_lm_scale field is appended by this collate function.
        [
            input_ids, segment_ids, input_mask, masked_lm_positions,
            masked_lm_labels, next_sentence_labels, masked_lm_scale
        ] = [0, 1, 2, 3, 4, 5, 6]
        # These fields have a fixed per-sample shape and can be stacked
        # directly along the batch dimension.
        for i in (input_ids, segment_ids, input_mask, next_sentence_labels):
            out[i] = stack_fn([x[i] for x in data])
        _, seq_length = out[input_ids].shape
        size = sum(len(x[masked_lm_positions]) for x in data)
        # Pad the total number of masked tokens up to a multiple of 8.
        if size % 8 != 0:
            size += 8 - (size % 8)
        # Padded positions point at token 0 and get label -1 so they can be
        # ignored when computing the masked LM loss.
        out[masked_lm_positions] = np.full(size, 0, dtype=np.int32)
        out[masked_lm_labels] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[masked_lm_positions]):
                # Flatten (sample index, position) into a single index over
                # the batch, so masked tokens can be gathered from the
                # flattened sequence output.
                out[masked_lm_positions][mask_token_num] = i * seq_length + pos
                out[masked_lm_labels][mask_token_num] = x[masked_lm_labels][j]
                mask_token_num += 1
        # The value of masked_lm_scale is equal to mask_token_num,
        # which would be used to compute the average masked_lm_loss.
        out.append(np.asarray([mask_token_num], dtype=np.float32))
        if args.amp and args.use_pure_fp16:
            # out[input_mask] = out[input_mask].astype(np.float16)
            out[masked_lm_scale] = out[masked_lm_scale].astype(np.float16)
        return out

    train_data_loader = DataLoader(
        dataset=train_data,
        places=places,
        feed_list=data_holders,
        batch_sampler=train_batch_sampler,
        collate_fn=_collate_data,
        num_workers=0,
        worker_init_fn=worker_init,
        return_list=False)
    return train_data_loader
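

# A minimal usage sketch (not part of the original module): one way this loader
# might be wired into a static-graph training script. The `args` attributes
# referenced here (max_predictions_per_seq, batch_size, amp, use_pure_fp16,
# seed) are assumed from how they are read in this module; the HDF5 path is
# purely hypothetical.
#
#   data_holders = create_pretraining_data_holder()
#   loader = create_pretraining_dataset(
#       args,
#       input_file="path/to/pretraining_shard.hdf5",  # hypothetical shard
#       data_holders=data_holders,
#       worker_init=WorkerInitObj(args.seed),
#       places=paddle.static.cuda_places())
#   for batch in loader:
#       ...  # each batch is fed to the executor together with `data_holders`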


def create_pretraining_data_holder():
    # Static-graph input placeholders, in the same order as the fields
    # returned by _collate_data above.
    input_ids = paddle.static.data(
        name="input_ids", shape=[-1, -1], dtype="int64")
    segment_ids = paddle.static.data(
        name="segment_ids", shape=[-1, -1], dtype="int64")
    input_mask = paddle.static.data(
        name="input_mask", shape=[-1, 1, 1, -1], dtype="int64")
    masked_lm_positions = paddle.static.data(
        name="masked_lm_positions", shape=[-1], dtype="int32")
    masked_lm_labels = paddle.static.data(
        name="masked_lm_labels", shape=[-1, 1], dtype="int64")
    next_sentence_labels = paddle.static.data(
        name="next_sentence_labels", shape=[-1, 1], dtype="int64")
    masked_lm_scale = paddle.static.data(
        name="masked_lm_scale", shape=[-1, 1], dtype="float32")
    return [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ]


def select_dataset_file_for_each_worker(files, f_start_id, num_trainers,
                                        trainer_id):
    """
    Split the training files across workers according to the worker index.
    """
    num_files = len(files)
    if num_trainers > num_files:
        remainder = num_trainers % num_files
        data_file = files[(
            f_start_id * num_trainers + trainer_id + remainder * f_start_id) %
                          num_files]
    else:
        data_file = files[(f_start_id * num_trainers + trainer_id) % num_files]
    return data_file
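

# Worked example (illustrative only): with 4 files and 2 trainers, the
# `num_trainers <= num_files` branch above gives
#   f_start_id=0 -> trainer 0 reads files[0], trainer 1 reads files[1]
#   f_start_id=1 -> trainer 0 reads files[2], trainer 1 reads files[3]
# so successive f_start_id values walk through the shards while no two
# trainers read the same file at the same step.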


class WorkerInitObj:
    """Seed helper passed to the DataLoader as worker_init_fn, so that each
    data loading worker gets a different, reproducible seed."""

    def __init__(self, seed):
        self.seed = seed

    def __call__(self, pid):
        np.random.seed(seed=self.seed + pid)
        random.seed(self.seed + pid)
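

# Note (sketch/assumption): an instance of WorkerInitObj is intended to be
# passed as the `worker_init` argument of create_pretraining_dataset above;
# worker process `pid` then seeds numpy and Python's `random` with
# `seed + pid`, keeping per-worker randomness distinct across workers but
# reproducible across runs.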


class PretrainingDataset(Dataset):
    def __init__(self, input_file, max_pred_length):
        self.input_file = input_file
        self.max_pred_length = max_pred_length
        # Load the whole HDF5 shard into memory as numpy arrays.
        f = h5py.File(input_file, "r")
        keys = [
            'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions',
            'masked_lm_ids', 'next_sentence_labels'
        ]
        self.inputs = [np.asarray(f[key][:]) for key in keys]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.inputs[0])

    def __getitem__(self, index):
        # Convert every field to int64; next_sentence_labels (index 5) may
        # come back as a scalar, so wrap it with np.asarray to keep it an
        # ndarray.
        [
            input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, next_sentence_labels
        ] = [
            input[index].astype(np.int64)
            if indice < 5 else np.asarray(input[index].astype(np.int64))
            for indice, input in enumerate(self.inputs)
        ]
        # input_mask = (1 - np.reshape(
        #     input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])) * -1e4
        input_mask = np.reshape(input_mask, [1, 1, input_mask.shape[0]])
        # masked_lm_positions is zero-padded; truncate positions and labels at
        # the first padded entry, or keep all max_pred_length entries if none
        # are padded.
        padded_mask_indices = (masked_lm_positions == 0).nonzero()[0]
        if len(padded_mask_indices) != 0:
            index = padded_mask_indices[0].item()
        else:
            index = self.max_pred_length
        masked_lm_labels = masked_lm_ids[:index]
        masked_lm_positions = masked_lm_positions[:index]
        # softmax_with_cross_entropy enforces the last dim to be of size 1.
        masked_lm_labels = np.expand_dims(masked_lm_labels, axis=-1)
        next_sentence_labels = np.expand_dims(next_sentence_labels, axis=-1)
        return [
            input_ids, segment_ids, input_mask, masked_lm_positions,
            masked_lm_labels, next_sentence_labels
        ]
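

# Quick inspection sketch (assumption: an HDF5 shard containing the keys listed
# in PretrainingDataset.__init__ exists at the hypothetical path below):
#
#   ds = PretrainingDataset("path/to/pretraining_shard.hdf5",
#                           max_pred_length=20)  # illustrative value
#   sample = ds[0]
#   # Field order: input_ids, segment_ids, input_mask, masked_lm_positions,
#   # masked_lm_labels, next_sentence_labels (masked_lm_scale is appended
#   # later by _collate_data).
#   print(len(ds), [np.asarray(x).shape for x in sample])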