 from paddle import nn
 import paddle.nn.functional as F
 from paddle.nn import LayerList
-# from paddle.nn.initializer import XavierNormal as xavier_uniform_
 from paddle.nn import Dropout, Linear, LayerNorm
 import numpy as np
 from ppocr.modeling.backbones.rec_svtrnet import Mlp, zeros_, ones_
@@ -30,7 +29,6 @@ class Transformer(nn.Layer):
     Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
     Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information
     Processing Systems, pages 6000-6010.
-
     Args:
         d_model: the number of expected features in the encoder/decoder inputs (default=512).
         nhead: the number of heads in the multiheadattention models (default=8).
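Note: the two documented defaults follow the usual multi-head attention constraint that d_model must be divisible by nhead (512 / 8 = 64 per head). A minimal sketch purely to illustrate that relationship, using Paddle's built-in MultiHeadAttention layer as a stand-in; it is not the Transformer head defined in this file.

    import paddle

    # Illustration only: Paddle's built-in layer, not ppocr's Transformer head.
    attn = paddle.nn.MultiHeadAttention(embed_dim=512, num_heads=8)  # head_dim = 512 // 8 = 64
    x = paddle.randn([2, 25, 512])                                   # [batch, seq_len, d_model]
    print(attn(x, x, x).shape)                                       # [2, 25, 512]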
@@ -162,7 +160,7 @@ def forward_test(self, src):
             memory = src
         dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
         dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32)
-        for len_dec_seq in range(1, self.max_len):
+        for len_dec_seq in range(1, paddle.to_tensor(self.max_len)):
             dec_seq_embed = self.embedding(dec_seq)
             dec_seq_embed = self.positional_encoding(dec_seq_embed)
             tgt_mask = self.generate_square_subsequent_mask(
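Note: this hunk (and the beam-search hunk below) replaces the Python int bound self.max_len with paddle.to_tensor(self.max_len) in the decode loop. A plausible motivation, stated here as an assumption rather than taken from the commit message, is dynamic-to-static export: under paddle.jit.to_static, a for loop over range() with a Tensor bound is lowered to a graph control-flow op instead of being unrolled at conversion time. A minimal toy sketch of the pattern (ToyDecoder is hypothetical, not PaddleOCR code):

    import paddle

    class ToyDecoder(paddle.nn.Layer):
        def __init__(self, max_len=5):
            super().__init__()
            self.max_len = max_len

        @paddle.jit.to_static
        def forward(self, x):
            out = x
            # Tensor-valued bound -> the loop becomes a control-flow op when exported
            for _ in range(1, paddle.to_tensor(self.max_len)):
                out = out + 1
            return out

    print(ToyDecoder()(paddle.zeros([1])))  # tensor holding 4.0 (loop runs max_len - 1 times)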
@@ -304,7 +302,7 @@ def collect_hypothesis_and_scores(inst_dec_beams, n_best):
             inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                 active_inst_idx_list)
             # Decode
-            for len_dec_seq in range(1, self.max_len):
+            for len_dec_seq in range(1, paddle.to_tensor(self.max_len)):
                 src_enc_copy = src_enc.clone()
                 active_inst_idx_list = beam_decode_step(
                     inst_dec_beams, len_dec_seq, src_enc_copy,
@@ -348,15 +346,12 @@ class MultiheadAttention(nn.Layer):
348346 """Allows the model to jointly attend to information
349347 from different representation subspaces.
350348 See reference: Attention Is All You Need
351-
352349 .. math::
353350 \t ext{MultiHead}(Q, K, V) = \t ext{Concat}(head_1,\dots,head_h)W^O
354351 \t ext{where} head_i = \t ext{Attention}(QW_i^Q, KW_i^K, VW_i^V)
355-
356352 Args:
357353 embed_dim: total dimension of the model
358354 num_heads: parallel attention layers, or heads
359-
360355 """
361356
362357 def __init__ (self , embed_dim , num_heads , dropout = 0. , self_attn = False ):
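Note: the docstring formula above can be read directly as code. Below is a hedged, self-contained sketch of per-head scaled dot-product attention followed by concatenation and the output projection W^O; the function name, shapes, and the use of a single fused [embed_dim, embed_dim] projection per Q/K/V are assumptions for illustration, not the MultiheadAttention class in this file.

    import paddle
    import paddle.nn.functional as F

    def multi_head_attention(q, k, v, wq, wk, wv, wo, num_heads):
        # q, k, v: [batch, seq_len, embed_dim]; wq/wk/wv/wo: [embed_dim, embed_dim]
        bs, seq_len, embed_dim = q.shape
        head_dim = embed_dim // num_heads

        def project(x, w):
            # XW^{Q/K/V} for all heads at once, then split into heads
            x = paddle.matmul(x, w).reshape([bs, -1, num_heads, head_dim])
            return x.transpose([0, 2, 1, 3])                   # [bs, heads, seq, head_dim]

        q, k, v = project(q, wq), project(k, wk), project(v, wv)
        scores = paddle.matmul(q, k, transpose_y=True) / head_dim ** 0.5
        heads = paddle.matmul(F.softmax(scores, axis=-1), v)   # head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)
        heads = heads.transpose([0, 2, 1, 3]).reshape([bs, seq_len, embed_dim])  # Concat(head_1, ..., head_h)
        return paddle.matmul(heads, wo)                         # ... W^O

    x = paddle.randn([2, 10, 512])
    w = [paddle.randn([512, 512]) * 0.02 for _ in range(4)]
    print(multi_head_attention(x, x, x, *w, num_heads=8).shape)  # [2, 10, 512]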