- # copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- This code is adapted from:
- https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/channel_reduction_encoder.py
- https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/robust_scanner_decoder.py
- """
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- import math
- import paddle
- from paddle import ParamAttr
- import paddle.nn as nn
- import paddle.nn.functional as F
- class BaseDecoder(nn.Layer):
- def __init__(self, **kwargs):
- super().__init__()
- def forward_train(self, feat, out_enc, targets, valid_ratios, word_positions):
- raise NotImplementedError
- def forward_test(self, feat, out_enc, valid_ratios, word_positions):
- raise NotImplementedError
- def forward(self,
- feat,
- out_enc,
- label=None,
- valid_ratios=None,
- word_positions=None,
- train_mode=True):
- self.train_mode = train_mode
- if train_mode:
- return self.forward_train(feat, out_enc, label, valid_ratios, word_positions)
- return self.forward_test(feat, out_enc, valid_ratios, word_positions)
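- # Note: `forward` dispatches on `train_mode` -- teacher-forced decoding with
- # the ground-truth `label` during training, step-wise greedy decoding at
- # inference (see the subclasses below for the concrete implementations).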
- class ChannelReductionEncoder(nn.Layer):
- """Change the channel number with a one by one convoluational layer.
- Args:
- in_channels (int): Number of input channels.
- out_channels (int): Number of output channels.
- """
- def __init__(self,
- in_channels,
- out_channels,
- **kwargs):
- super(ChannelReductionEncoder, self).__init__()
- self.layer = nn.Conv2D(
- in_channels, out_channels, kernel_size=1, stride=1, padding=0, weight_attr=nn.initializer.XavierNormal())
- def forward(self, feat):
- """
- Args:
- feat (Tensor): Image features with the shape of
- :math:`(N, C_{in}, H, W)`.
- Returns:
- Tensor: A tensor of shape :math:`(N, C_{out}, H, W)`.
- """
- return self.layer(feat)
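- # A minimal usage sketch for the encoder (shapes here are illustrative
- # assumptions, not taken from the source):
- #
- #     encoder = ChannelReductionEncoder(in_channels=512, out_channels=128)
- #     feat = paddle.rand([2, 512, 8, 32])  # (N, C_in, H, W)
- #     out_enc = encoder(feat)              # (N, C_out, H, W) -> [2, 128, 8, 32]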
- def masked_fill(x, mask, value):
- y = paddle.full(x.shape, value, x.dtype)
- return paddle.where(mask, y, x)
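- # `masked_fill` reproduces torch's Tensor.masked_fill with paddle.where.
- # A small hedged example (illustrative values only):
- #
- #     x = paddle.to_tensor([1.0, 2.0, 3.0])
- #     mask = paddle.to_tensor([True, False, True])
- #     masked_fill(x, mask, 0.0)  # -> [0.0, 2.0, 0.0]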
- class DotProductAttentionLayer(nn.Layer):
- def __init__(self, dim_model=None):
- super().__init__()
- self.scale = dim_model**-0.5 if dim_model is not None else 1.
- def forward(self, query, key, value, h, w, valid_ratios=None):
- query = paddle.transpose(query, (0, 2, 1))
- logits = paddle.matmul(query, key) * self.scale
- n, c, t = logits.shape
- # reshape to (n, c, h, w)
- logits = paddle.reshape(logits, [n, c, h, w])
- if valid_ratios is not None:
- # mask attention weights beyond each image's valid width
- for i, valid_ratio in enumerate(valid_ratios):
- valid_width = min(w, int(w * valid_ratio + 0.5))
- if valid_width < w:
- logits[i, :, :, valid_width:] = float('-inf')
- # reshape back to (n, c, t)
- logits = paddle.reshape(logits, [n, c, t])
- weights = F.softmax(logits, axis=2)
- value = paddle.transpose(value, (0, 2, 1))
- glimpse = paddle.matmul(weights, value)
- glimpse = paddle.transpose(glimpse, (0, 2, 1))
- return glimpse
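- # Expected call shapes for the layer above (a sketch; the variable names are
- # assumptions): `query` is (N, D, T), `key` is (N, D, H*W), `value` is
- # (N, D_v, H*W), and the returned glimpse is (N, D_v, T):
- #
- #     attn = DotProductAttentionLayer()
- #     glimpse = attn(query, key, value, h=8, w=32, valid_ratios=None)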
- class SequenceAttentionDecoder(BaseDecoder):
- """Sequence attention decoder for RobustScanner.
- RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
- Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_
- Args:
- num_classes (int): Number of output classes :math:`C`.
- rnn_layers (int): Number of RNN layers.
- dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
- dim_model (int): Dimension :math:`D_m` of the model. Should also be the
- same as encoder output vector ``out_enc``.
- max_seq_len (int): Maximum output sequence length :math:`T`.
- start_idx (int): The index of `<SOS>`.
- mask (bool): Whether to mask input features according to
- ``img_meta['valid_ratio']``.
- padding_idx (int): The index of `<PAD>`.
- dropout (float): Dropout rate.
- return_feature (bool): Return feature or logits as the result.
- encode_value (bool): Whether to use the output of encoder ``out_enc``
- as `value` of attention layer. If False, the original feature
- ``feat`` will be used.
- Warning:
- This decoder will not predict the final class which is assumed to be
- `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
- is also ignored by loss as specified in
- :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`.
- """
- def __init__(self,
- num_classes=None,
- rnn_layers=2,
- dim_input=512,
- dim_model=128,
- max_seq_len=40,
- start_idx=0,
- mask=True,
- padding_idx=None,
- dropout=0,
- return_feature=False,
- encode_value=False):
- super().__init__()
- self.num_classes = num_classes
- self.dim_input = dim_input
- self.dim_model = dim_model
- self.return_feature = return_feature
- self.encode_value = encode_value
- self.max_seq_len = max_seq_len
- self.start_idx = start_idx
- self.mask = mask
- self.embedding = nn.Embedding(
- self.num_classes, self.dim_model, padding_idx=padding_idx)
- self.sequence_layer = nn.LSTM(
- input_size=dim_model,
- hidden_size=dim_model,
- num_layers=rnn_layers,
- time_major=False,
- dropout=dropout)
- self.attention_layer = DotProductAttentionLayer()
- self.prediction = None
- if not self.return_feature:
- pred_num_classes = num_classes - 1
- self.prediction = nn.Linear(
- dim_model if encode_value else dim_input, pred_num_classes)
- def forward_train(self, feat, out_enc, targets, valid_ratios):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
- is the index of a character.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- Returns:
- Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
- ``return_feature=False``. Otherwise it would be the hidden feature
- before the prediction projection layer, whose shape is
- :math:`(N, T, D_m)`.
- """
- tgt_embedding = self.embedding(targets)
- n, c_enc, h, w = out_enc.shape
- assert c_enc == self.dim_model
- _, c_feat, _, _ = feat.shape
- assert c_feat == self.dim_input
- _, len_q, c_q = tgt_embedding.shape
- assert c_q == self.dim_model
- assert len_q <= self.max_seq_len
- query, _ = self.sequence_layer(tgt_embedding)
- query = paddle.transpose(query, (0, 2, 1))
- key = paddle.reshape(out_enc, [n, c_enc, h * w])
- if self.encode_value:
- value = key
- else:
- value = paddle.reshape(feat, [n, c_feat, h * w])
- attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
- attn_out = paddle.transpose(attn_out, (0, 2, 1))
- if self.return_feature:
- return attn_out
- out = self.prediction(attn_out)
- return out
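- # Shape flow in forward_train, for reference: targets (N, T) -> embedding
- # (N, T, D_m) -> LSTM -> query (N, D_m, T); out_enc flattens into the key
- # (N, D_m, H*W); the attention output is (N, T, D_m) when encode_value is
- # True, otherwise (N, T, D_i), before the optional prediction projection.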
- def forward_test(self, feat, out_enc, valid_ratios):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- Returns:
- Tensor: The output logit sequence tensor of shape
- :math:`(N, T, C-1)`.
- """
- seq_len = self.max_seq_len
- batch_size = feat.shape[0]
- decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx)
- outputs = []
- for i in range(seq_len):
- step_out = self.forward_test_step(feat, out_enc, decode_sequence,
- i, valid_ratios)
- outputs.append(step_out)
- max_idx = paddle.argmax(step_out, axis=1, keepdim=False)
- if i < seq_len - 1:
- decode_sequence[:, i + 1] = max_idx
- outputs = paddle.stack(outputs, 1)
- return outputs
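- # Note: the loop above is greedy decoding -- the argmax of step i is written
- # into slot i + 1 of `decode_sequence` and re-embedded as history for the
- # next step; `outputs` stacks the per-step class scores into (N, T, C-1).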
- def forward_test_step(self, feat, out_enc, decode_sequence, current_step,
- valid_ratios):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- decode_sequence (Tensor): Shape :math:`(N, T)`. The tensor that
- stores history decoding result.
- current_step (int): Current decoding step.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- Returns:
- Tensor: Shape :math:`(N, C-1)`. The logit tensor of predicted
- tokens at current time step.
- """
-
- embed = self.embedding(decode_sequence)
- n, c_enc, h, w = out_enc.shape
- assert c_enc == self.dim_model
- _, c_feat, _, _ = feat.shape
- assert c_feat == self.dim_input
- _, _, c_q = embed.shape
- assert c_q == self.dim_model
- query, _ = self.sequence_layer(embed)
- query = paddle.transpose(query, (0, 2, 1))
- key = paddle.reshape(out_enc, [n, c_enc, h * w])
- if self.encode_value:
- value = key
- else:
- value = paddle.reshape(feat, [n, c_feat, h * w])
- # [n, c, l]
- attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
- out = attn_out[:, :, current_step]
- if self.return_feature:
- return out
- out = self.prediction(out)
- out = F.softmax(out, axis=-1)
- return out
- class PositionAwareLayer(nn.Layer):
- def __init__(self, dim_model, rnn_layers=2):
- super().__init__()
- self.dim_model = dim_model
- self.rnn = nn.LSTM(
- input_size=dim_model,
- hidden_size=dim_model,
- num_layers=rnn_layers,
- time_major=False)
- self.mixer = nn.Sequential(
- nn.Conv2D(
- dim_model, dim_model, kernel_size=3, stride=1, padding=1),
- nn.ReLU(),
- nn.Conv2D(
- dim_model, dim_model, kernel_size=3, stride=1, padding=1))
- def forward(self, img_feature):
- n, c, h, w = img_feature.shape
- rnn_input = paddle.transpose(img_feature, (0, 2, 3, 1))
- rnn_input = paddle.reshape(rnn_input, (n * h, w, c))
- rnn_output, _ = self.rnn(rnn_input)
- rnn_output = paddle.reshape(rnn_output, (n, h, w, c))
- rnn_output = paddle.transpose(rnn_output, (0, 3, 1, 2))
- out = self.mixer(rnn_output)
- return out
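- # Note: each of the H feature rows is treated as an independent length-W
- # sequence for the LSTM (the (N, C, H, W) map is regrouped into N*H rows),
- # and the two 3x3 convolutions then mix information back across rows.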
- class PositionAttentionDecoder(BaseDecoder):
- """Position attention decoder for RobustScanner.
- RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
- Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_
- Args:
- num_classes (int): Number of output classes :math:`C`.
- rnn_layers (int): Number of RNN layers.
- dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
- dim_model (int): Dimension :math:`D_m` of the model. Should also be the
- same as encoder output vector ``out_enc``.
- max_seq_len (int): Maximum output sequence length :math:`T`.
- mask (bool): Whether to mask input features according to
- ``img_meta['valid_ratio']``.
- return_feature (bool): Return feature or logits as the result.
- encode_value (bool): Whether to use the output of encoder ``out_enc``
- as `value` of attention layer. If False, the original feature
- ``feat`` will be used.
- Warning:
- This decoder will not predict the final class which is assumed to be
- `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
- is also ignored by the loss.
-
- """
- def __init__(self,
- num_classes=None,
- rnn_layers=2,
- dim_input=512,
- dim_model=128,
- max_seq_len=40,
- mask=True,
- return_feature=False,
- encode_value=False):
- super().__init__()
- self.num_classes = num_classes
- self.dim_input = dim_input
- self.dim_model = dim_model
- self.max_seq_len = max_seq_len
- self.return_feature = return_feature
- self.encode_value = encode_value
- self.mask = mask
- self.embedding = nn.Embedding(self.max_seq_len + 1, self.dim_model)
- self.position_aware_module = PositionAwareLayer(
- self.dim_model, rnn_layers)
- self.attention_layer = DotProductAttentionLayer()
- self.prediction = None
- if not self.return_feature:
- pred_num_classes = num_classes - 1
- self.prediction = nn.Linear(
- dim_model if encode_value else dim_input, pred_num_classes)
- def _get_position_index(self, length, batch_size):
- position_index_list = []
- for i in range(batch_size):
- position_index = paddle.arange(0, end=length, step=1, dtype='int64')
- position_index_list.append(position_index)
- batch_position_index = paddle.stack(position_index_list, axis=0)
- return batch_position_index
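- # A vectorized equivalent of the loop above, assuming paddle.tile semantics
- # (a sketch, not a drop-in change to the source):
- #
- #     index = paddle.arange(0, length, dtype='int64')                 # (T,)
- #     batch_index = paddle.tile(index.unsqueeze(0), [batch_size, 1])  # (N, T)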
- def forward_train(self, feat, out_enc, targets, valid_ratios, position_index):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
- is the index of a character.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- position_index (Tensor): The position of each word.
- Returns:
- Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
- ``return_feature=False``. Otherwise it will be the hidden feature
- before the prediction projection layer, whose shape is
- :math:`(N, T, D_m)`.
- """
- n, c_enc, h, w = out_enc.shape
- assert c_enc == self.dim_model
- _, c_feat, _, _ = feat.shape
- assert c_feat == self.dim_input
- _, len_q = targets.shape
- assert len_q <= self.max_seq_len
-
- position_out_enc = self.position_aware_module(out_enc)
- query = self.embedding(position_index)
- query = paddle.transpose(query, (0, 2, 1))
- key = paddle.reshape(position_out_enc, (n, c_enc, h * w))
- if self.encode_value:
- value = paddle.reshape(out_enc, (n, c_enc, h * w))
- else:
- value = paddle.reshape(feat, (n, c_feat, h * w))
- attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
- attn_out = paddle.transpose(attn_out, (0, 2, 1)) # [n, len_q, dim_v]
- if self.return_feature:
- return attn_out
- return self.prediction(attn_out)
- def forward_test(self, feat, out_enc, valid_ratios, position_index):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- position_index (Tensor): The position of each word.
- Returns:
- Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
- ``return_feature=False``. Otherwise it would be the hidden feature
- before the prediction projection layer, whose shape is
- :math:`(N, T, D_m)`.
- """
- n, c_enc, h, w = out_enc.shape
- assert c_enc == self.dim_model
- _, c_feat, _, _ = feat.shape
- assert c_feat == self.dim_input
- position_out_enc = self.position_aware_module(out_enc)
-
- query = self.embedding(position_index)
- query = paddle.transpose(query, (0, 2, 1))
- key = paddle.reshape(position_out_enc, (n, c_enc, h * w))
- if self.encode_value:
- value = paddle.reshape(out_enc, (n, c_enc, h * w))
- else:
- value = paddle.reshape(feat, (n, c_feat, h * w))
- attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
- attn_out = paddle.transpose(attn_out, (0, 2, 1)) # [n, len_q, dim_v]
- if self.return_feature:
- return attn_out
- return self.prediction(attn_out)
- class RobustScannerFusionLayer(nn.Layer):
- def __init__(self, dim_model, dim=-1):
- super(RobustScannerFusionLayer, self).__init__()
- self.dim_model = dim_model
- self.dim = dim
- self.linear_layer = nn.Linear(dim_model * 2, dim_model * 2)
- def forward(self, x0, x1):
- assert x0.shape == x1.shape
- fusion_input = paddle.concat([x0, x1], self.dim)
- output = self.linear_layer(fusion_input)
- output = F.glu(output, self.dim)
- return output
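- # Fusion math, for reference: with [a; b] = Linear(concat(x0, x1)), F.glu
- # returns a * sigmoid(b), so two (N, T, D) inputs are concatenated to
- # (N, T, 2*D) and gated back down to (N, T, D), which matches the input
- # size of the prediction layer in RobustScannerDecoder.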
- class RobustScannerDecoder(BaseDecoder):
- """Decoder for RobustScanner.
- RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
- Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_
- Args:
- num_classes (int): Number of output classes :math:`C`.
- dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
- dim_model (int): Dimension :math:`D_m` of the model. Should also be the
- same as encoder output vector ``out_enc``.
- max_seq_len (int): Maximum output sequence length :math:`T`.
- start_idx (int): The index of `<SOS>`.
- mask (bool): Whether to mask input features according to
- ``img_meta['valid_ratio']``.
- padding_idx (int): The index of `<PAD>`.
- encode_value (bool): Whether to use the output of encoder ``out_enc``
- as `value` of attention layer. If False, the original feature
- ``feat`` will be used.
- Warning:
- This decoder will not predict the final class which is assumed to be
- `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
- is also ignored by loss as specified in
- :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`.
- """
- def __init__(self,
- num_classes=None,
- dim_input=512,
- dim_model=128,
- hybrid_decoder_rnn_layers=2,
- hybrid_decoder_dropout=0,
- position_decoder_rnn_layers=2,
- max_seq_len=40,
- start_idx=0,
- mask=True,
- padding_idx=None,
- encode_value=False):
- super().__init__()
- self.num_classes = num_classes
- self.dim_input = dim_input
- self.dim_model = dim_model
- self.max_seq_len = max_seq_len
- self.encode_value = encode_value
- self.start_idx = start_idx
- self.padding_idx = padding_idx
- self.mask = mask
- # init hybrid decoder
- self.hybrid_decoder = SequenceAttentionDecoder(
- num_classes=num_classes,
- rnn_layers=hybrid_decoder_rnn_layers,
- dim_input=dim_input,
- dim_model=dim_model,
- max_seq_len=max_seq_len,
- start_idx=start_idx,
- mask=mask,
- padding_idx=padding_idx,
- dropout=hybrid_decoder_dropout,
- encode_value=encode_value,
- return_feature=True
- )
- # init position decoder
- self.position_decoder = PositionAttentionDecoder(
- num_classes=num_classes,
- rnn_layers=position_decoder_rnn_layers,
- dim_input=dim_input,
- dim_model=dim_model,
- max_seq_len=max_seq_len,
- mask=mask,
- encode_value=encode_value,
- return_feature=True
- )
- self.fusion_module = RobustScannerFusionLayer(
- self.dim_model if encode_value else dim_input)
- pred_num_classes = num_classes - 1
- self.prediction = nn.Linear(dim_model if encode_value else dim_input,
- pred_num_classes)
- def forward_train(self, feat, out_enc, target, valid_ratios, word_positions):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- target (Tensor): A tensor of shape :math:`(N, T)`. Each element
- is the index of a character.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- word_positions (Tensor): The position of each word.
- Returns:
- Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`.
- """
- hybrid_glimpse = self.hybrid_decoder.forward_train(
- feat, out_enc, target, valid_ratios)
- position_glimpse = self.position_decoder.forward_train(
- feat, out_enc, target, valid_ratios, word_positions)
- fusion_out = self.fusion_module(hybrid_glimpse, position_glimpse)
- out = self.prediction(fusion_out)
- return out
- def forward_test(self, feat, out_enc, valid_ratios, word_positions):
- """
- Args:
- feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
- out_enc (Tensor): Encoder output of shape
- :math:`(N, D_m, H, W)`.
- valid_ratios (Tensor): Valid width ratio of each image in the batch.
- word_positions (Tensor): The position of each word.
- Returns:
- Tensor: The output logit sequence tensor of shape
- :math:`(N, T, C-1)`.
- """
- seq_len = self.max_seq_len
- batch_size = feat.shape[0]
- decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx)
- position_glimpse = self.position_decoder.forward_test(
- feat, out_enc, valid_ratios, word_positions)
- outputs = []
- for i in range(seq_len):
- hybrid_glimpse_step = self.hybrid_decoder.forward_test_step(
- feat, out_enc, decode_sequence, i, valid_ratios)
- fusion_out = self.fusion_module(hybrid_glimpse_step,
- position_glimpse[:, i, :])
- char_out = self.prediction(fusion_out)
- char_out = F.softmax(char_out, -1)
- outputs.append(char_out)
- max_idx = paddle.argmax(char_out, axis=1, keepdim=False)
- if i < seq_len - 1:
- decode_sequence[:, i + 1] = max_idx
- outputs = paddle.stack(outputs, 1)
- return outputs
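- # Note: position glimpses for all T steps are computed once up front, while
- # hybrid glimpses are produced step by step so that each step conditions on
- # the tokens decoded so far; the two are fused per step before prediction.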
- class RobustScannerHead(nn.Layer):
- def __init__(self,
- out_channels, # 90 + unknown + start + padding
- in_channels,
- enc_outchannles=128,
- hybrid_dec_rnn_layers=2,
- hybrid_dec_dropout=0,
- position_dec_rnn_layers=2,
- start_idx=0,
- max_text_length=40,
- mask=True,
- padding_idx=None,
- encode_value=False,
- **kwargs):
- super(RobustScannerHead, self).__init__()
- # encoder module
- self.encoder = ChannelReductionEncoder(
- in_channels=in_channels, out_channels=enc_outchannles)
- # decoder module
- self.decoder = RobustScannerDecoder(
- num_classes=out_channels,
- dim_input=in_channels,
- dim_model=enc_outchannles,
- hybrid_decoder_rnn_layers=hybrid_dec_rnn_layers,
- hybrid_decoder_dropout=hybrid_dec_dropout,
- position_decoder_rnn_layers=position_dec_rnn_layers,
- max_seq_len=max_text_length,
- start_idx=start_idx,
- mask=mask,
- padding_idx=padding_idx,
- encode_value=encode_value)
- def forward(self, inputs, targets=None):
- '''
- targets (list): [label, valid_ratios, word_positions] during training;
- at inference it holds [valid_ratios, word_positions] (or just
- [word_positions]); `word_positions` is always the last element.
- '''
- out_enc = self.encoder(inputs)
- valid_ratios = None
- word_positions = targets[-1]
- if len(targets) > 1:
- valid_ratios = targets[-2]
-
- if self.training:
- label = targets[0] # label
- label = paddle.to_tensor(label, dtype='int64')
- final_out = self.decoder(
- inputs, out_enc, label, valid_ratios, word_positions)
- if not self.training:
- final_out = self.decoder(
- inputs,
- out_enc,
- label=None,
- valid_ratios=valid_ratios,
- word_positions=word_positions,
- train_mode=False)
- return final_out
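- # End-to-end usage sketch (hypothetical sizes; `out_channels` counts the
- # character set plus unknown/start/padding as noted above):
- #
- #     head = RobustScannerHead(out_channels=93, in_channels=512)
- #     head.eval()
- #     x = paddle.rand([2, 512, 8, 32])  # backbone feature map (N, C, H, W)
- #     pos = paddle.tile(
- #         paddle.arange(40, dtype='int64').unsqueeze(0), [2, 1])
- #     probs = head(x, targets=[pos])    # (2, 40, 92) per-step class scores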