
Seq2Seq Models: An Attention-Based Decoder

Posted by xuepro on September 8, 2020

This is the code originally written for my book. I later wrote a different attention-mechanism implementation, so this version will be dropped from the book; I am keeping it here for reference. Note: the decoder code contains a major bug that I won't reveal — can you find it?

If you want to really understand the underlying principles, please see my book 《解剖深度学习原理-从0编写深度学习库》 (Dissecting the Principles of Deep Learning: Writing a Deep Learning Library from Scratch). I believe there is no deep learning book better suited to beginners: books by the big names assume the reader is already an insider, and they tend to make simple things complicated. This book takes everything apart step by step, so the reader suddenly sees that deep learning is actually quite simple. You will find very few books with this level of hands-on detail (a Japanese author and an American author have written similar ones, but those only cover the beginnings and are far from complete). Because of the pandemic, the publisher pushed publication back from May to September; in the meantime I kept rereading the manuscript, found a few more problems, and sent the corrections back to the publisher.

The original manuscript ran to about 1000 pages, so the publisher decided to split it into two books: the PyTorch and reinforcement learning parts were removed and will be published separately as an e-book (paper books barely make money — a typical book earns around 5000 RMB, which is a very poor return). Does anyone know a good platform for publishing e-books?

Encoder Based on an Embedding Layer

from rnn import *
from Layers import *
from train import *

class EncoderRNN_Embed(object):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size,self.hidden_size = input_size,hidden_size   
        #print(input_size, hidden_size)
        self.embedding = Embedding(input_size, hidden_size)
        self.gru = GRU(hidden_size, hidden_size,1)

    def forward(self, input, hidden):
        # input: a sequence of word-index arrays; each word is embedded to shape (1, 1, hidden_size)
        self.embedded_x = []
        self.embedded_out = []
        for x in input:
            embedded = self.embedding(x).reshape(1, 1, -1)
            self.embedded_x.append(self.embedding.x)      # keep the embedding input for backward
            self.embedded_out.append(embedded)

        self.embedded_out = np.concatenate(self.embedded_out, axis=0)   # (T, 1, hidden_size)
        output, hidden = self.gru(self.embedded_out, hidden)
        return output, hidden

    def __call__(self,input, hidden):
        return self.forward(input, hidden)
    
    def initHidden(self):
        return np.zeros((1, 1, self.hidden_size))
    
    def parameters(self):
        return self.gru.parameters()
 
    def backward(self, dhs):
        dinput, dhidden = self.gru.backward(dhs, self.embedded_out)
        T = dinput.shape[0]
        for t in range(T):
            dinput_t = dinput[t]
            self.embedding.x = self.embedded_x[t]   # restore the x saved during the forward pass
            self.embedding.backward(dinput_t)

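To make the expected shapes concrete, here is a small usage sketch. It is purely illustrative and not taken from the book's code; it assumes the Embedding and GRU classes from the Layers/rnn modules behave like their PyTorch counterparts, i.e. the GRU returns the full output sequence plus the final hidden state.

import numpy as np

# hypothetical usage sketch of EncoderRNN_Embed
encoder = EncoderRNN_Embed(input_size=10, hidden_size=8)   # a toy vocabulary of 10 words
tokens = np.array([[3], [5], [1]])                         # a 3-word sentence ending with EOS_token (= 1)
outputs, hidden = encoder(tokens, encoder.initHidden())
# outputs should have shape (3, 1, 8): one hidden vector per input word
# hidden should have shape (1, 1, 8): the final hidden state handed to the decoder
print(outputs.shape, hidden.shape)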
Attention-Based Decoder

A decoder written from scratch, following the diagram in the PyTorch seq2seq tutorial.
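Roughly, each decoding step t performs the following computation (this is just a summary of the forward_step code below; W_attn, W_combine and W_out stand for the affine maps of the attn, attn_combine and out Dense layers):

    e_t       = embedding(y_{t-1})                        # previous target word (teacher forcing) or previous prediction
    a_t       = softmax(W_attn [e_t ; h_{t-1}])           # attention weights over max_length positions
    context_t = sum_i a_t[i] * encoder_outputs[i]         # encoder_outputs zero-padded to max_length
    h_t       = GRU(ReLU(W_combine [e_t ; context_t]), h_{t-1})
    output_t  = W_out h_t                                  # scores over the output vocabulary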

from Layers import *
from rnn import *
import util
    
class  DecoderRNN_Atten(object):
    def __init__(self, hidden_size, output_size,num_layers=1,teacher_forcing_ratio = 0.5,dropout_p=0.1, \
                 max_length=MAX_LENGTH):
        super(DecoderRNN_Atten, self).__init__()
       
        self.hidden_size = hidden_size       
        self.num_layers = 1   # only a single-layer GRU is supported in this implementation
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = Embedding(output_size, hidden_size)
        self.dropout = Dropout(self.dropout_p)
                
        self.attn = Dense(self.hidden_size * 2, self.max_length)
        #self.attn = Atten(hidden_size)    
        self.attn_combine = Dense(self.hidden_size * 2, self.hidden_size)        
        self.relu = Relu()
        
        self.gru = GRU(hidden_size, hidden_size,1)
        self.out = Dense(hidden_size, output_size)
        
        self.layers = [self.embedding,self.attn,self.attn_combine,self.gru,self.out]
        self._params = None
        self.use_dropout = False

    def initHidden(self,batch_size):
        # This is what we'll initialise our hidden state as
        self.h_0 =  np.zeros((self.num_layers, batch_size, self.hidden_size))    
    
    def forward_step(self, input, prev_hidden, encoder_outputs, training=True):
        embedded = self.embedding(input)  # (B, D)
        self.embedded_x.append(self.embedding.x)   # keep the embedding input for backward
        
        if self.use_dropout and training:
            embedded = self.dropout(embedded,training)
            self.dropout_mask.append(self.dropout._mask)        
             
        attn_out = self.attn(np.concatenate((embedded, prev_hidden[0]),axis=1))
        self.attn_x.append(self.attn.x)
        #context,alphas,energies = self.attn(hidden, encoder_outputs)  
        #self.attn_x.append((context,alphas,energies,hidden))               
        
        attn_weights  = util.softmax(attn_out)        
        self.attn_weights_seq.append(attn_weights)
        
        attn_applied  = bmm(attn_weights,encoder_outputs)
        
        attn_combine_out = self.attn_combine(np.concatenate((embedded, attn_applied),axis=1))
        self.attn_combine_x.append(self.attn_combine.x)
          
        output = self.relu(attn_combine_out)   
        self.relu_x.append(self.relu.x)  
        
        relu_out = output.reshape(1, output.shape[0], -1)
        self.gru_x.append(relu_out)   # save the GRU input for backward
        output_hs, hidden = self.gru(relu_out, prev_hidden)
        self.gru_hs.append(self.gru.hs)  # save the GRU's intermediate hidden states for backward
        self.gru_zs.append(self.gru.zs)  # save the GRU's intermediate computation results for backward

        output_hs_t = output_hs[0]  # seq_len == 1
        output = self.out(output_hs_t) 
        self.out_x.append(self.out.x)
        return output,hidden,output_hs_t
        
    def forward(self, input_tensor, encoder_outputs):
        self.encoder_outputs = encoder_outputs  # (T, B, D)
        target_length = input_tensor.shape[0]

        teacher_forcing_ratio = self.teacher_forcing_ratio
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        hidden_t = encoder_outputs[-1].reshape(1,encoder_outputs[-1].shape[0],encoder_outputs[-1].shape[1])
        h_0 = hidden_t.copy()
        input_t = np.array([SOS_token])
           
        output_hs = []
        output = []  
        self.gru_x = [] #gru input       
        self.gru_hs = []
        self.gru_zs = []
        self.dropout_mask = []
        self.embedded_x = []
        self.relu_x = []
        self.attn_x = []
        self.attn_combine_x = []
        self.attn_weights_seq = []
        self.out_x = []
        
        encoder_outputs = np.pad(self.encoder_outputs,((0,self.max_length-self.encoder_outputs.shape[0]),(0,0),(0,0)), 'constant')
        for t in range(target_length):
            output_t, hidden_t,output_hs_t = self.forward_step(input_t, hidden_t,encoder_outputs)
            output_hs.append(output_hs_t)
            output.append(output_t)
           
            if use_teacher_forcing:
                input_t = input_tensor[t]  # Teacher forcing
            else:             
                input_t = np.argmax(output_t)  # greedy decoding: take the most probable word
                if input_t == EOS_token:
                    break  
                input_t = np.array([input_t])
                
        output = np.array(output)
        self.output_hs = np.array(output_hs)
        self.h_0 = h_0  
        return  output
    
    def __call__(self, input, encoder_outputs):
        return self.forward(input, encoder_outputs)
    
    def evaluate(self, encoder_outputs,max_length):
        hidden = encoder_outputs[-1]
        hidden = hidden.reshape(1,hidden.shape[0],hidden.shape[1])
        input_T = self.encoder_outputs.shape[0]
        encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
        
        # input:(1, batch_size=1, input_size)  
        input = np.array([SOS_token])
        decoded_words = []
        for t in range(max_length):
            output, hidden, _ = self.forward_step(input, hidden, encoder_outputs, False)
            output = np.argmax(output)
            if output == EOS_token:
                break
            decoded_words.append(output)
            input = np.array([output])
        return decoded_words
    
    def backward(self,dZs):
        #input = np.concatenate(self.input,axis=0)
        input_T = self.encoder_outputs.shape[0]
        d_encoder_outputs  = np.zeros_like(self.encoder_outputs)
        T = len(dZs)
        encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
        
        for i in reversed(range(T)):
            self.out.x = self.out_x[i]
            dh = self.out.backward(dZs[i])            
            
            dhs = np.expand_dims(dh, axis=0)
            self.gru.hs = self.gru_hs[i]
            self.gru.zs = self.gru_zs[i]
            drelu_out,dprev_hidden = self.gru.backward(dhs,self.gru_x[i])           
            drelu_out = drelu_out.reshape(drelu_out.shape[1],drelu_out.shape[2])         
            
            self.relu.x = self.relu_x[i]
            d_relu_x = self.relu.backward(drelu_out)
            d_attn_combine_out = d_relu_x
            
            self.attn_combine.x = self.attn_combine_x[i]        
            d_attn_combine_x = self.attn_combine.backward(d_attn_combine_out)
            d_embedded, d_attn_applied = d_attn_combine_x[:,:self.hidden_size], d_attn_combine_x[:,self.hidden_size:]
            
            attn_weights = self.attn_weights_seq[i]
            d_attn_weights,d_encoder_outputs_t = bmm_backward(d_attn_applied,attn_weights,encoder_outputs)
        
            d_attn_out  = softmax_backward_2(attn_weights,d_attn_weights)
            
            self.attn.x = self.attn_x[i]
            d_attn_x = self.attn.backward(d_attn_out)
            
            d_embedded_2, dprev_hidden_2 = d_attn_x[:,:self.hidden_size], d_attn_x[:,self.hidden_size:]
            
            d_embedded+= d_embedded_2
            
            if self.use_dropout:
                self.dropout._mask = self.dropout_mask[i]
                d_embedding = self.dropout.backward(d_embedded)                  
            else:
                d_embedding = d_embedded             
            
            self.embedding.x = self.embedded_x[i]  # restore the x saved during the forward pass
            self.embedding.backward(d_embedding)

            dprev_hidden += dprev_hidden_2
            d_encoder_outputs += d_encoder_outputs_t[:input_T]  # accumulate the gradient from every time step
        
        d_encoder_outputs[input_T-1]+=dprev_hidden[0]       
        return dprev_hidden,d_encoder_outputs #dhidden 
  
    def parameters(self):
        if self._params is None:
            self._params = []
            for layer in self.layers:
                for  i, _ in enumerate(layer.params):  
                    self._params.append([layer.params[i],layer.grads[i]])  
        return self._params     
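The decoder calls several helpers (bmm, bmm_backward, util.softmax, softmax_backward_2) that come from the book's library and are not listed in this post. Purely for illustration — the actual implementations may differ — here is a minimal NumPy sketch that is consistent with the shapes used above (attn_weights of shape (B, max_length) and encoder_outputs of shape (max_length, B, D)):

import numpy as np

# illustrative sketches only; the real versions live in the book's rnn/util modules
def softmax_sketch(x):
    # row-wise softmax over the last axis
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def softmax_backward_2_sketch(y, dy):
    # backward of softmax, given its output y and the upstream gradient dy
    return y * (dy - np.sum(dy * y, axis=-1, keepdims=True))

def bmm_sketch(attn_weights, encoder_outputs):
    # attn_applied[b, :] = sum_t attn_weights[b, t] * encoder_outputs[t, b, :]
    return np.einsum('bt,tbd->bd', attn_weights, encoder_outputs)

def bmm_backward_sketch(d_attn_applied, attn_weights, encoder_outputs):
    # gradients of bmm_sketch with respect to its two inputs
    d_attn_weights = np.einsum('bd,tbd->bt', d_attn_applied, encoder_outputs)
    d_encoder_outputs = np.einsum('bt,bd->tbd', attn_weights, d_attn_applied)
    return d_attn_weights, d_encoder_outputs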

Training the Model

A single training step

def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, \
                loss_fn,reg,last_hidden = True,max_length=0):
    clip = 5.
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.shape[0] #input_tensor.size(0)
    
    loss = 0
    encode_input = input_tensor    
    encoder_output, encoder_hidden = encoder(encode_input, None)  
    if last_hidden:
        output = decoder(target_tensor, encoder_hidden)   
    else:
        output = decoder(target_tensor, encoder_output)
    
    target = target_tensor.reshape(-1,1)
    if output.shape[0]!= target.shape[0]:       
        target = target[:output.shape[0],:]      
    loss,grad = loss_fn(output, target)    
    loss /=(output.shape[0])
    
    if last_hidden:
        dinput,dhidden = decoder.backward(grad)
        encoder.backward(dhidden[0]) #,encode_input)        
    else:
        dinput,d_encoder_outputs = decoder.backward(grad)        
        encoder.backward(d_encoder_outputs)
  
    if reg is not None:
        loss+=encoder_optimizer.regularization(reg)
        loss+=decoder_optimizer.regularization(reg)

    util.clip_grad_norm_nn(encoder_optimizer.parameters(),clip,None)
    util.clip_grad_norm_nn(decoder_optimizer.parameters(),clip,None)
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss
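train_step uses util.rnn_loss_grad from the book's library as the loss function; it is not shown in this post. A plausible sketch, assuming it computes a summed softmax cross-entropy over the decoder's output sequence and returns both the loss and the gradient with respect to the logits (the real implementation may differ):

import numpy as np

def rnn_loss_grad_sketch(output, target):
    # output: (T, B, V) logits produced by the decoder; target: (T, 1) word indices
    T = output.shape[0]
    loss = 0.0
    grad = np.zeros_like(output)
    for t in range(T):
        logits = output[t]                                          # (B, V), here B == 1
        probs = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs /= probs.sum(axis=1, keepdims=True)
        idx = int(target[t, 0])
        loss += -np.log(probs[0, idx] + 1e-12)
        dlogits = probs
        dlogits[0, idx] -= 1.0                                      # d(cross-entropy)/d(logits)
        grad[t] = dlogits
    return loss, grad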

The training loop

import numpy as np
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline

def timeSince(start):
    now = time.time()
    s = now - start  
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def trainIters(encoder, decoder, encoder_optimizer,decoder_optimizer,train_pairs,valid_pairs, encoder_output_all = False,print_every=1000, plot_every=100, reg =None):
    start = time.time()
    valid_losses = []
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every    

    training_pairs = train_pairs
    loss_fn =  util.rnn_loss_grad
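    # NOTE: n_iters is a module-level variable; it is set in the training script further below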
    
    for iter in range(1, n_iters + 1):        
        pair = training_pairs[iter - 1]
        input_tensor,target_tensor = pair[0],pair[1]     
       
        loss = train_step(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, loss_fn,reg,encoder_output_all)
        
        if loss is None: continue       
        print_loss_total += loss
        plot_loss_total += loss
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0            
            plt.plot(plot_losses)            
            valid_losses.append(validation_loss(encoder, decoder, valid_pairs,encoder_output_all,20,reg))
            plt.plot(valid_losses)           
            plt.legend(["train_losses","valid_losses"])
            plt.show()

def validation_loss(encoder, decoder, valid_pairs,last_hidden = True,validation_size = None,reg =None):    
    if validation_size is not None:
        valid_pairs = [random.choice(valid_pairs) for i in range(validation_size)]
    total_loss = 0
    loss_fn =  util.rnn_loss_grad    
    teacher_forcing_ratio = decoder.teacher_forcing_ratio 
    decoder.teacher_forcing_ratio = 1.1
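    # a ratio above 1 means random.random() < ratio is always true, so teacher forcing is used for every validation step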
    for pair in valid_pairs:
        encode_input = pair[0]   
        target_tensor = pair[1]   
        
        encoder_output, encoder_hidden = encoder(encode_input, None) 
        if last_hidden:
            output = decoder(target_tensor, encoder_hidden) 
           
        else:
            output = decoder(target_tensor, encoder_output) 
      
        target = target_tensor.reshape(-1,1)
        if output.shape[0]!= target.shape[0]:
            target = target[:output.shape[0],:]        
        loss,grad = loss_fn(output, target)    
        loss /=(output.shape[0])
       
        if reg is not None:
            params = encoder.parameters() + decoder.parameters()
            reg_loss = 0
            for p, _ in params:
                reg_loss += np.sum(p**2)
            loss += reg * reg_loss
    
        total_loss += loss
        
    decoder.teacher_forcing_ratio = teacher_forcing_ratio 
    return total_loss/len(valid_pairs)

The vocabulary:

import numpy as np
from collections import defaultdict
SOS_token = 0
EOS_token = 1
UNK_token = 2
    
class Vocab:
    def __init__(self, min_count=1, corpus=None):
        self.min_count = min_count
        self.word2count = {}
        self.word2index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.index2word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.n_words = 3  # SOS, EOS and UNK are already counted
        if corpus is not None:
            for sentence in corpus:
                self.addSentence(sentence)
            self.build()               

    def addSentence(self, sentence):
        if isinstance(sentence,str):
            for word in sentence.split(' '):
                self.addWord(word)                
        else:
            for word in sentence:
                self.addWord(word)              

    def addWord(self, word):
        if word not in self.word2count:
            self.word2count[word] = 1
        else:
            self.word2count[word] += 1
            
    def build(self):
        for word in self.word2count:
            if self.word2count[word]<self.min_count:
                self.word2index[word] = UNK_token
            else:
                self.word2index[word] = self.n_words 
                self.index2word[self.n_words] = word
                self.n_words += 1
                
            
vocab = Vocab()
vocab.addSentence("i am from china")
vocab.build()

print(vocab.word2index["i"])
print(vocab.index2word[4])
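With CPython 3.7+ (where dicts preserve insertion order), the four new words are indexed in the order they were added ("i" → 3, "am" → 4, and so on), so the two print statements output:

3
am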

Building the vocabularies for machine translation

in_vocab = Vocab()
out_vocab = Vocab()
lang2lang_file = './data/eng-fra.txt'
pairs = read_pairs(lang2lang_file,True)
for pair in pairs: 
    in_vocab.addSentence(pair[0])
    out_vocab.addSentence(pair[1])
        
in_vocab.build()
out_vocab.build()        
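read_pairs is part of the book's data-preparation code and is not shown in this post. Purely as an illustration (the real version, including what its second argument controls, may differ), here is a minimal sketch that reads the tab-separated eng-fra.txt file from the PyTorch translation tutorial and keeps only short, normalized pairs:

import re
import unicodedata

def read_pairs_sketch(path, filter_long=True, max_len=10):
    # hypothetical stand-in for read_pairs; each line of eng-fra.txt is "english<TAB>french"
    def normalize(s):
        s = unicodedata.normalize('NFD', s.lower().strip())
        s = ''.join(c for c in s if unicodedata.category(c) != 'Mn')   # strip accents
        s = re.sub(r"([.!?])", r" \1", s)                              # separate punctuation
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s.strip()
    pairs = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 2:
                continue
            eng, fra = normalize(parts[0]), normalize(parts[1])
            if filter_long and (len(eng.split(' ')) >= max_len or len(fra.split(' ')) >= max_len):
                continue
            pairs.append([eng, fra])
    return pairs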

def indexesFromSentence(vocab, sentence):
    return [vocab.word2index.get(word, UNK_token) for word in sentence.split(' ')]

def tensorFromSentence(vocab, sentence):
    indexes = indexesFromSentence(vocab, sentence)
    indexes.append(EOS_token)
    return np.array(indexes).reshape(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(in_vocab, pair[0])
    target_tensor = tensorFromSentence(out_vocab, pair[1])
    return (input_tensor, target_tensor)

def indexToSentence(vocab, indexes):
    sentence = [vocab.index2word[idx] for idx in indexes]
    return ' '.join(sentence)

input_tensor, target_tensor = tensorsFromPair(random.choice(pairs))
print(input_tensor.shape)
print(input_tensor)
print(target_tensor) 

Start training

from train import *
from Layers import *
from rnn import *
import util

hidden_size = 256 
num_layers = 1

clip = 5.
learning_rate = 0.03
decoder_learning_ratio = 1.0
teacher_forcing_ratio = 0.5

output_size = out_vocab.n_words  #num of words

encoder = EncoderRNN_Embed(in_vocab.n_words, hidden_size)
decoder = DecoderRNN_Atten(hidden_size,out_vocab.n_words,num_layers,teacher_forcing_ratio)

momentum = 0.3
decay_every  =1000
encoder_optimizer = SGD(encoder.parameters(), learning_rate, momentum,decay_every)
decoder_optimizer = SGD(decoder.parameters(), learning_rate*decoder_learning_ratio, momentum,decay_every)

reg = None  # set to e.g. 1e-2 to enable L2 regularization

np.random.shuffle(pairs)
train_n = int(len(pairs) * 0.98)
train_pairs = pairs[:train_n]
valid_pairs = pairs[train_n:]

print_every, plot_every = 100, 100

n_iters = 40000
idx_train_pairs = [tensorsFromPair(random.choice(train_pairs)) for i in range(n_iters)]
idx_valid_pairs = [tensorsFromPair(pair) for pair in valid_pairs]

trainIters(encoder, decoder,encoder_optimizer,decoder_optimizer,idx_train_pairs,idx_valid_pairs,False,print_every, plot_every,reg)

print("train finished!")

Evaluation

The evaluation function

def evaluate(encoder, decoder, in_vocab, out_vocab, sentence, \
             max_length=MAX_LENGTH, last_hidden=True):
    encode_input = tensorFromSentence(in_vocab, sentence)
    encoder_output, encoder_hidden = encoder(encode_input, None)
    if last_hidden:
        output_sentence = decoder.evaluate(encoder_hidden, max_length)
    else:
        output_sentence = decoder.evaluate(encoder_output, max_length)
    output_sentence = indexToSentence(out_vocab,output_sentence)
    return output_sentence

Run the evaluation

indices = np.random.randint(len(train_pairs), size=3)
for i in indices:
    pair = pairs[i]
    print(pair)
    sentence = pair[0]   
    sentence = evaluate(encoder, decoder,in_vocab,out_vocab, sentence,MAX_LENGTH,False)
    print(sentence)

A slightly simplified version of the decoder above:

from Layers import *
from rnn import *
import util

MAX_LENGTH = 10

class  DecoderRNN_Atten_py(object):
    def __init__(self, hidden_size, output_size,num_layers=1,teacher_forcing_ratio = 0.5,dropout_p=0.1, \
                 max_length=MAX_LENGTH):
        super(DecoderRNN_Atten_py, self).__init__()
       
        self.hidden_size = hidden_size       
        self.num_layers = 1
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = Embedding(output_size, hidden_size)
        self.dropout = Dropout(self.dropout_p)
                
        self.attn = Dense(self.hidden_size * 2, self.max_length)
        #self.attn = Atten(hidden_size)    
        self.attn_combine = Dense(self.hidden_size * 2, self.hidden_size)        
        self.relu = Relu()
        
        self.gru = GRU(hidden_size, hidden_size,1)
        self.out = Dense(hidden_size, output_size)
        
        self.layers = [self.embedding,self.attn,self.attn_combine,self.gru,self.out]
        self._params = None
        self.use_dropout = False

    def initHidden(self,batch_size):
        # This is what we'll initialise our hidden state as
        self.h_0 =  np.zeros((self.num_layers, batch_size, self.hidden_size))    
    
    def forward_step(self, input, prev_hidden, encoder_outputs, training=True):
        embedded = self.embedding(input)  # (B, D)
        self.embedded_x.append(self.embedding.x)   # keep the embedding input for backward

        if self.use_dropout and training:
            embedded = self.dropout(embedded,training)
            self.dropout_mask.append(self.dropout._mask)        
             
       
        attn_out = self.attn(np.concatenate((embedded, prev_hidden[0]),axis=1))
        self.attn_x.append(self.attn.x)
        #context,alphas,energies = self.attn(hidden, encoder_outputs)  
        #self.attn_x.append((context,alphas,energies,hidden))               
        
        attn_weights = util.softmax(attn_out)
        self.attn_weights_seq.append(attn_weights)

        print(attn_weights.shape, encoder_outputs.shape)

        attn_applied = bmm(attn_weights, encoder_outputs)
        print("attn_applied", attn_applied)

        print("embedded.shape, attn_applied.shape")
        print(embedded.shape, attn_applied.shape)
        attn_combine_out = self.attn_combine(np.concatenate((embedded, attn_applied), axis=1))
        self.attn_combine_x.append(self.attn_combine.x)
        print("attn_combine_out", attn_combine_out)
            
        output = self.relu(attn_combine_out)   
        self.relu_x.append(self.relu.x)  
        
        relu_out = output.reshape(1, output.shape[0], -1)
        print("relu_out.shape, prev_hidden.shape")
        print(relu_out.shape, prev_hidden.shape)

        self.gru_x.append(relu_out)   # save the GRU input for backward
        output_hs, hidden = self.gru(relu_out, prev_hidden)
        self.gru_hs.append(self.gru.hs)  # save the GRU's intermediate hidden states for backward
        self.gru_zs.append(self.gru.zs)  # save the GRU's intermediate computation results for backward

        print("gru output", output_hs)

        output_hs_t = output_hs[0]  # seq_len == 1
        output = self.out(output_hs_t) 
        self.out_x.append(self.out.x)      
       
        print("out",output)
        
        self.output_hs = np.array(output_hs)
        self.h_0 = hidden.copy()        
        self.hs = np.concatenate(self.gru_hs, axis=1)
        self.zs = np.concatenate(self.gru_zs, axis=1)
        
        return output,hidden,output_hs_t
    
    def init_temp_variable(self):
        self.embedded_x = []
        self.dropout_mask = []
        self.attn_x = []
        self.attn_weights_seq = [] 
        self.attn_combine_x = [] 
        self.relu_x = []
        self.gru_x = []
        self.gru_hs = []
        self.gru_zs = []
        self.out_x = []       
      
        
    def forward(self, input_tensor, encoder_outputs):
        self.init_temp_variable()
        self.encoder_outputs = encoder_outputs  # (T, B, D), the raw encoder outputs
        target_length = input_tensor.shape[0]

        teacher_forcing_ratio = self.teacher_forcing_ratio
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        hidden_t = encoder_outputs[-1].reshape(1,encoder_outputs[-1].shape[0],encoder_outputs[-1].shape[1])
        h_0 = hidden_t.copy()
        input_t = np.array([SOS_token])
           
        output_hs = []
        output = []  
       
        encoder_outputs = np.pad(self.encoder_outputs,((0,self.max_length-self.encoder_outputs.shape[0]),(0,0),(0,0)), 'constant')
        for t in range(target_length):
            output_t, hidden_t,output_hs_t = self.forward_step(input_t, hidden_t,encoder_outputs)
            output_hs.append(output_hs_t)
            output.append(output_t)
           
            if use_teacher_forcing:
                input_t = input_tensor[t]  # Teacher forcing
            else:             
                input_t = np.argmax(output_t)  # greedy decoding: take the most probable word
                if input_t == EOS_token:
                    break  
                input_t = np.array([input_t])
                
        output = np.array(output)
        self.output_hs = np.array(output_hs)
        self.h_0 = h_0  
        return  output
    
    def __call__(self, input, encoder_outputs):
        return self.forward(input, encoder_outputs)
    
    def evaluate(self, encoder_outputs,max_length):
        hidden = encoder_outputs[-1]
        hidden = hidden.reshape(1,hidden.shape[0],hidden.shape[1])
        input_T = self.encoder_outputs.shape[0]
        encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
        
        # input:(1, batch_size=1, input_size)  
        input = np.array([SOS_token])
        decoded_words = []
        for t in range(max_length):
            output, hidden, _ = self.forward_step(input, hidden, encoder_outputs, False)
            output = np.argmax(output)
            if output == EOS_token:
                break
            decoded_words.append(output)
            input = np.array([output])
        return decoded_words
    
    def backward(self,dZs):
        #input = np.concatenate(self.input,axis=0)
        input_T = self.encoder_outputs.shape[0]
        d_encoder_outputs  = np.zeros_like(self.encoder_outputs)
        T = len(dZs)
        encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
        
        for i in reversed(range(T)):
            self.out.x = self.out_x[i]
            dh = self.out.backward(dZs[i])            
            
            dhs = np.expand_dims(dh, axis=0)
            self.gru.hs = self.gru_hs[i]
            self.gru.zs = self.gru_zs[i]
            drelu_out,dprev_hidden = self.gru.backward(dhs,self.gru_x[i])           
            drelu_out = drelu_out.reshape(drelu_out.shape[1],drelu_out.shape[2])         
            
            self.relu.x = self.relu_x[i]
            d_relu_x = self.relu.backward(drelu_out)
            d_attn_combine_out = d_relu_x
            
            self.attn_combine.x = self.attn_combine_x[i]        
            d_attn_combine_x = self.attn_combine.backward(d_attn_combine_out)
            d_embedded, d_attn_applied = d_attn_combine_x[:,:self.hidden_size], d_attn_combine_x[:,self.hidden_size:]
            
            attn_weights = self.attn_weights_seq[i]
            d_attn_weights,d_encoder_outputs_t = bmm_backward(d_attn_applied,attn_weights,encoder_outputs)
        
            d_attn_out  = softmax_backward_2(attn_weights,d_attn_weights)
            
            self.attn.x = self.attn_x[i]
            d_attn_x = self.attn.backward(d_attn_out)
            
            d_embedded_2, dprev_hidden_2 = d_attn_x[:,:self.hidden_size], d_attn_x[:,self.hidden_size:]
            
            d_embedded+= d_embedded_2
            
            if self.use_dropout:
                self.dropout._mask = self.dropout_mask[i]
                d_embedding = self.dropout.backward(d_embedded)                  
            else:
                d_embedding = d_embedded             
            
            if False:
                self.embedding.x = self.embedded_x[i]  # restore the x saved during the forward pass
                self.embedding.backward(d_embedding)

            dprev_hidden += dprev_hidden_2
            d_encoder_outputs += d_encoder_outputs_t[:input_T]  # accumulate the gradient from every time step
        
        d_encoder_outputs[input_T-1]+=dprev_hidden[0]       
        return dprev_hidden,d_encoder_outputs[:input_T] #dhidden 
  
    def parameters(self):
        if self._params is None:
            self._params = []
            for layer in self.layers:
                for  i, _ in enumerate(layer.params):  
                    self._params.append([layer.params[i],layer.grads[i]])  
        return self._params   
