This was originally code from my book, but I ended up writing a different attention-mechanism implementation, so this code had to be cut. I am preserving it here for everyone's reference. Note: the decoder code contains one major bug. I won't reveal it; can you find it?
If you want to truly understand the principles, see my book 《解剖深度学习原理-从0编写深度学习库》 (Dissecting Deep Learning: Writing a Deep Learning Library from Scratch). There is no deep learning book better suited to beginners. The books by the masters are hard going for ordinary readers: they assume the reader is already an insider, when in fact most readers are beginners, and they like to make simple things complicated and mysterious. My book takes everything apart in detail, like the proverbial cook carving an ox, so that it suddenly becomes clear: deep learning really is this simple! I walk the reader through every step by hand. You won't find this kind of content in other books, or such books are extremely rare (one by a Japanese author and one by an American come close, but they only cover the beginnings and are far from complete). Because of the pandemic, the publisher postponed publication from May to September. In the meantime I kept reviewing the manuscript, found some more problems, and sent corrections back to the publisher.
Also, the original manuscript ran to about 1000 pages, so the publisher decided to split it into two volumes: the PyTorch and reinforcement learning parts were removed and will be published separately as an e-book (print books make little money; a typical book brings in around 5000 yuan, a very poor return). Can anyone tell me which platform I could publish an e-book on?
Encoder based on an embedding layer
from rnn import *
from Layers import *
from train import *
class EncoderRNN_Embed(object):
def __init__(self, input_size, hidden_size):
super().__init__()
self.input_size,self.hidden_size = input_size,hidden_size
self.embedding = Embedding(input_size, hidden_size)
self.gru = GRU(hidden_size, hidden_size,1)
def forward(self, input, hidden):
        # caches for the backward pass
        self.embedded_x = []
        self.embedded_out = []
for x in input:
embedded = self.embedding(x).reshape(1,1,-1)
self.embedded_x.append(self.embedding.x)
self.embedded_out.append( embedded)
self.embedded_out = np.concatenate(self.embedded_out,axis=0)
output, hidden = self.gru(self.embedded_out, hidden)
return output, hidden
def __call__(self,input, hidden):
return self.forward(input, hidden)
def initHidden(self):
return np.zeros((1, 1, self.hidden_size))
def parameters(self):
return self.gru.parameters()
def backward(self,dhs):
dinput,dhidden = self.gru.backward(dhs,self.embedded_out)
T = dinput.shape[0]
for t in range(T):
dinput_t = dinput[t]
            self.embedding.x = self.embedded_x[t]  # restore the input cached during forward
            self.embedding.backward(dinput_t)
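In the backward pass above, self.embedding.x has to be restored before each call to self.embedding.backward, because the Embedding layer only knows which rows to update from the indices cached during forward. The real Embedding lives in the author's Layers module; the following is only a minimal sketch of that caching idea, with all names hypothetical:

import numpy as np

class EmbeddingSketch:
    # Minimal embedding layer: forward caches the looked-up indices,
    # backward scatters the incoming gradient back onto those rows.
    def __init__(self, vocab_size, dim):
        self.W = np.random.randn(vocab_size, dim) * 0.01
        self.dW = np.zeros_like(self.W)
        self.x = None  # indices cached by forward

    def __call__(self, x):
        self.x = x          # shape (B,), integer token ids
        return self.W[x]    # shape (B, dim)

    def backward(self, d_out):
        # accumulate, since the same row may be hit at several time steps
        np.add.at(self.dW, self.x, d_out)

emb = EmbeddingSketch(vocab_size=10, dim=4)
out = emb(np.array([3]))         # one token, batch size 1
emb.backward(np.ones_like(out))  # gradient flows into row 3 of dW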
Decoder with an attention mechanism
A decoder written from scratch, following the diagram in the PyTorch tutorial.
from Layers import *
from rnn import *
import util
class DecoderRNN_Atten(object):
def __init__(self, hidden_size, output_size,num_layers=1,teacher_forcing_ratio = 0.5,dropout_p=0.1, \
max_length=MAX_LENGTH):
super(DecoderRNN_Atten, self).__init__()
self.hidden_size = hidden_size
        self.num_layers = num_layers
self.teacher_forcing_ratio = teacher_forcing_ratio
self.dropout_p = dropout_p
self.max_length = max_length
self.embedding = Embedding(output_size, hidden_size)
self.dropout = Dropout(self.dropout_p)
self.attn = Dense(self.hidden_size * 2, self.max_length)
#self.attn = Atten(hidden_size)
self.attn_combine = Dense(self.hidden_size * 2, self.hidden_size)
self.relu = Relu()
        self.gru = GRU(hidden_size, hidden_size, num_layers)
self.out = Dense(hidden_size, output_size)
self.layers = [self.embedding,self.attn,self.attn_combine,self.gru,self.out]
self._params = None
self.use_dropout = False
def initHidden(self,batch_size):
# This is what we'll initialise our hidden state as
self.h_0 = np.zeros((self.num_layers, batch_size, self.hidden_size))
def forward_step(self, input, prev_hidden,encoder_outputs,training=True):
embedded = self.embedding(input) #(B,D))
self.embedded_x.append(self.embedding.x)
if self.use_dropout and training:
embedded = self.dropout(embedded,training)
self.dropout_mask.append(self.dropout._mask)
attn_out = self.attn(np.concatenate((embedded, prev_hidden[0]),axis=1))
self.attn_x.append(self.attn.x)
attn_weights = util.softmax(attn_out)
self.attn_weights_seq.append(attn_weights)
attn_applied = bmm(attn_weights,encoder_outputs)
attn_combine_out = self.attn_combine(np.concatenate((embedded, attn_applied),axis=1))
self.attn_combine_x.append(self.attn_combine.x)
output = self.relu(attn_combine_out)
self.relu_x.append(self.relu.x)
relu_out = output.reshape(1,output.shape[0],-1)
        self.gru_x.append(relu_out)  # input of the GRU, saved for backward
output_hs, hidden = self.gru(relu_out,prev_hidden)
        self.gru_hs.append(self.gru.hs)  # save the hidden states for backward
        self.gru_zs.append(self.gru.zs)  # save the intermediate results for backward
        output_hs_t = output_hs[0]  # seq_len == 1
output = self.out(output_hs_t)
self.out_x.append(self.out.x)
return output,hidden,output_hs_t
    def forward(self, input_tensor, encoder_outputs):
self.encoder_outputs = encoder_outputs #(T,B,D)
        target_length = input_tensor.shape[0]
        use_teacher_forcing = random.random() < self.teacher_forcing_ratio
hidden_t = encoder_outputs[-1].reshape(1,encoder_outputs[-1].shape[0],encoder_outputs[-1].shape[1])
h_0 = hidden_t.copy()
input_t = np.array([SOS_token])
output_hs = []
output = []
self.gru_x = [] #gru input
self.gru_hs = []
self.gru_zs = []
self.dropout_mask = []
self.embedded_x = []
self.relu_x = []
self.attn_x = []
self.attn_combine_x = []
self.attn_weights_seq = []
self.out_x = []
encoder_outputs = np.pad(self.encoder_outputs,((0,self.max_length-self.encoder_outputs.shape[0]),(0,0),(0,0)), 'constant')
for t in range(target_length):
output_t, hidden_t,output_hs_t = self.forward_step(input_t, hidden_t,encoder_outputs)
output_hs.append(output_hs_t)
output.append(output_t)
            if use_teacher_forcing:
                input_t = input_tensor[t]  # teacher forcing: feed the ground truth
            else:
                input_t = np.argmax(output_t)  # greedy: take the most probable token
                if input_t == EOS_token:
                    break
                input_t = np.array([input_t])
output = np.array(output)
self.output_hs = np.array(output_hs)
self.h_0 = h_0
return output
    def __call__(self, input_tensor, encoder_outputs):
        return self.forward(input_tensor, encoder_outputs)
def evaluate(self, encoder_outputs,max_length):
hidden = encoder_outputs[-1]
hidden = hidden.reshape(1,hidden.shape[0],hidden.shape[1])
input_T = self.encoder_outputs.shape[0]
encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
# input:(1, batch_size=1, input_size)
input = np.array([SOS_token])
decoded_words = []
for t in range(max_length):
output,hidden,_ = self.forward_step(input,hidden,encoder_outputs,False)
output = np.argmax(output)
            if output == EOS_token:
                break
            decoded_words.append(output)
            input = np.array([output])
return decoded_words
def backward(self,dZs):
input_T = self.encoder_outputs.shape[0]
d_encoder_outputs = np.zeros_like(self.encoder_outputs)
T = len(dZs)
encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
for i in reversed(range(T)):
self.out.x = self.out_x[i]
dh = self.out.backward(dZs[i])
dhs = np.expand_dims(dh, axis=0)
self.gru.hs = self.gru_hs[i]
self.gru.zs = self.gru_zs[i]
drelu_out,dprev_hidden = self.gru.backward(dhs,self.gru_x[i])
drelu_out = drelu_out.reshape(drelu_out.shape[1],drelu_out.shape[2])
self.relu.x = self.relu_x[i]
d_relu_x = self.relu.backward(drelu_out)
d_attn_combine_out = d_relu_x
self.attn_combine.x = self.attn_combine_x[i]
d_attn_combine_x = self.attn_combine.backward(d_attn_combine_out)
d_embedded, d_attn_applied = d_attn_combine_x[:,:self.hidden_size], d_attn_combine_x[:,self.hidden_size:]
attn_weights = self.attn_weights_seq[i]
d_attn_weights,d_encoder_outputs_t = bmm_backward(d_attn_applied,attn_weights,encoder_outputs)
d_attn_out = softmax_backward_2(attn_weights,d_attn_weights)
self.attn.x = self.attn_x[i]
d_attn_x = self.attn.backward(d_attn_out)
d_embedded_2, dprev_hidden_2 = d_attn_x[:,:self.hidden_size], d_attn_x[:,self.hidden_size:]
d_embedded+= d_embedded_2
if self.use_dropout:
self.dropout._mask = self.dropout_mask[i]
d_embedding = self.dropout.backward(d_embedded)
else:
d_embedding = d_embedded
            self.embedding.x = self.embedded_x[i]  # restore the input cached during forward
self.embedding.backward(d_embedding)
dprev_hidden+= dprev_hidden_2
            d_encoder_outputs += d_encoder_outputs_t[:input_T]  # accumulate over every time step
            d_encoder_outputs[input_T-1] += dprev_hidden[0]
        return dprev_hidden, d_encoder_outputs
def parameters(self):
if self._params is None:
self._params = []
for layer in self.layers:
for i, _ in enumerate(layer.params):
self._params.append([layer.params[i],layer.grads[i]])
return self._params
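The decoder relies on three helpers that are not defined in this post: bmm, bmm_backward and softmax_backward_2 (they come from the author's rnn and util modules). Judging from the shapes used above, attn_weights is (B, max_length) and the padded encoder_outputs is (max_length, B, D), so a plausible NumPy sketch of these helpers, offered only as an assumption about what the library does, is:

import numpy as np

def bmm(attn_weights, encoder_outputs):
    # (B, T) x (T, B, D) -> (B, D): weighted sum of the encoder states
    return np.einsum('bt,tbd->bd', attn_weights, encoder_outputs)

def bmm_backward(d_applied, attn_weights, encoder_outputs):
    # gradients of the weighted sum w.r.t. the weights and the encoder states
    d_weights = np.einsum('bd,tbd->bt', d_applied, encoder_outputs)
    d_enc = np.einsum('bt,bd->tbd', attn_weights, d_applied)
    return d_weights, d_enc

def softmax_backward_2(softmax_out, d_softmax):
    # Jacobian-vector product of softmax: y * (dy - sum(dy * y))
    s = np.sum(d_softmax * softmax_out, axis=-1, keepdims=True)
    return softmax_out * (d_softmax - s)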
Training the model
Single training step
def train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, \
loss_fn,reg,last_hidden = True,max_length=0):
clip = 5.
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
loss = 0
encode_input = input_tensor
encoder_output, encoder_hidden = encoder(encode_input, None)
if last_hidden:
output = decoder(target_tensor, encoder_hidden)
else:
output = decoder(target_tensor, encoder_output)
target = target_tensor.reshape(-1,1)
if output.shape[0]!= target.shape[0]:
target = target[:output.shape[0],:]
loss,grad = loss_fn(output, target)
loss /=(output.shape[0])
if last_hidden:
dinput,dhidden = decoder.backward(grad)
        encoder.backward(dhidden[0])
else:
dinput,d_encoder_outputs = decoder.backward(grad)
encoder.backward(d_encoder_outputs)
if reg is not None:
loss+=encoder_optimizer.regularization(reg)
loss+=decoder_optimizer.regularization(reg)
util.clip_grad_norm_nn(encoder_optimizer.parameters(),clip,None)
util.clip_grad_norm_nn(decoder_optimizer.parameters(),clip,None)
encoder_optimizer.step()
decoder_optimizer.step()
return loss
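util.clip_grad_norm_nn is another helper from the author's library. Judging from the call site, it receives the optimizer's list of [param, grad] pairs and a maximum norm (the purpose of the third argument is not shown here), so it presumably rescales all gradients by their global norm. A hypothetical sketch:

import numpy as np

def clip_grad_norm_sketch(params, max_norm):
    # params: list of [param, grad] pairs, as returned by parameters()
    total = np.sqrt(sum(np.sum(g ** 2) for _, g in params))
    if total > max_norm:
        scale = max_norm / (total + 1e-6)
        for _, g in params:
            g *= scale  # rescale the gradients in place
    return total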
Iterative training
import numpy as np
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline
def timeSince(start):
now = time.time()
s = now - start
m = math.floor(s / 60)
s -= m * 60
return '%dm %ds' % (m, s)
def trainIters(encoder, decoder, encoder_optimizer,decoder_optimizer,train_pairs,valid_pairs, encoder_output_all = False,print_every=1000, plot_every=100, reg =None):
start = time.time()
valid_losses = []
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every
training_pairs = train_pairs
loss_fn = util.rnn_loss_grad
    for iter in range(1, n_iters + 1):  # n_iters is a global, defined in the training script below
pair = training_pairs[iter - 1]
input_tensor,target_tensor = pair[0],pair[1]
loss = train_step(input_tensor, target_tensor, encoder,
decoder, encoder_optimizer, decoder_optimizer, loss_fn,reg,encoder_output_all)
if loss is None: continue
print_loss_total += loss
plot_loss_total += loss
if iter % print_every == 0:
print_loss_avg = print_loss_total / print_every
print_loss_total = 0
print('%s (%d %d%%) %.4f' % (timeSince(start),
iter, iter / n_iters * 100, print_loss_avg))
if iter % plot_every == 0:
plot_loss_avg = plot_loss_total / plot_every
plot_losses.append(plot_loss_avg)
plot_loss_total = 0
plt.plot(plot_losses)
valid_losses.append(validation_loss(encoder, decoder, valid_pairs,encoder_output_all,20,reg))
plt.plot(valid_losses)
plt.legend(["train_losses","valid_losses"])
plt.show()
def validation_loss(encoder, decoder, valid_pairs,last_hidden = True,validation_size = None,reg =None):
if validation_size is not None:
valid_pairs = [random.choice(valid_pairs) for i in range(validation_size)]
total_loss = 0
loss_fn = util.rnn_loss_grad
teacher_forcing_ratio = decoder.teacher_forcing_ratio
    decoder.teacher_forcing_ratio = 1.1  # > 1 forces teacher forcing during validation
for pair in valid_pairs:
encode_input = pair[0]
target_tensor = pair[1]
encoder_output, encoder_hidden = encoder(encode_input, None)
if last_hidden:
output = decoder(target_tensor, encoder_hidden)
else:
output = decoder(target_tensor, encoder_output)
target = target_tensor.reshape(-1,1)
if output.shape[0]!= target.shape[0]:
target = target[:output.shape[0],:]
loss,grad = loss_fn(output, target)
loss /=(output.shape[0])
if reg is not None:
params = encoder.parameters()+decoder.parameters()
reg_loss =0
            for p, g in params:
                reg_loss += np.sum(p ** 2)
            loss += reg * reg_loss
total_loss += loss
decoder.teacher_forcing_ratio = teacher_forcing_ratio
return total_loss/len(valid_pairs)
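util.rnn_loss_grad is also not shown in this post. Given that the decoder's out layer produces raw scores of shape (T, B, vocab_size) for a batch size of 1, and the target is a column of token ids, it is presumably a per-step softmax cross-entropy that returns both the total loss and the gradient with respect to the scores. A hypothetical sketch under those assumptions:

import numpy as np

def rnn_loss_grad_sketch(scores, target):
    # scores: (T, 1, V) raw logits, target: (T, 1) token ids, batch size 1
    T = scores.shape[0]
    loss, grads = 0.0, np.zeros_like(scores)
    for t in range(T):
        z = scores[t] - scores[t].max(axis=1, keepdims=True)  # stabilize
        p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)  # softmax
        loss += -np.log(p[0, target[t, 0]] + 1e-12)
        p[0, target[t, 0]] -= 1.0  # gradient: softmax minus one-hot
        grads[t] = p
    return loss, grads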
Vocabulary:
import numpy as np
SOS_token = 0
EOS_token = 1
UNK_token = 2
class Vocab:
def __init__(self,min_count=1,corpus = None):
        self.min_count = min_count
self.word2count = {}
self.word2index = {"SOS":0,"EOS":1, "UNK":2}
self.index2word = {0: "SOS", 1: "EOS",2: "UNK"}
        self.n_words = 3  # counts SOS, EOS and UNK
if corpus is not None:
for sentence in corpus:
self.addSentence(sentence)
self.build()
def addSentence(self, sentence):
if isinstance(sentence,str):
for word in sentence.split(' '):
self.addWord(word)
else:
for word in sentence:
self.addWord(word)
def addWord(self, word):
if word not in self.word2count:
self.word2count[word] = 1
else:
self.word2count[word] += 1
def build(self):
for word in self.word2count:
if self.word2count[word]<self.min_count:
self.word2index[word] = UNK_token
else:
self.word2index[word] = self.n_words
self.index2word[self.n_words] = word
self.n_words += 1
vocab = Vocab()
vocab.addSentence("i am from china")
vocab.build()
print(vocab.word2index["i"])
print(vocab.index2word[4])
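With min_count now honored, any word that appears fewer than min_count times is mapped to UNK at build time. A quick check:

vocab2 = Vocab(min_count=2)
vocab2.addSentence("i am from china i am happy")
vocab2.build()
print(vocab2.word2index["i"])      # appears twice, gets its own index
print(vocab2.word2index["china"])  # appears once, mapped to UNK_token (2)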
Building the vocabularies for machine translation
in_vocab = Vocab()
out_vocab = Vocab()
lang2lang_file = './data/eng-fra.txt'
pairs = read_pairs(lang2lang_file,True)
for pair in pairs:
in_vocab.addSentence(pair[0])
out_vocab.addSentence(pair[1])
in_vocab.build()
out_vocab.build()
def indexesFromSentence(vocab, sentence):
return [vocab.word2index[word] for word in sentence.split(' ')]
def tensorFromSentence(vocab, sentence):
indexes = indexesFromSentence(vocab, sentence)
indexes.append(EOS_token)
return np.array(indexes).reshape(-1, 1)
def tensorsFromPair(pair):
input_tensor = tensorFromSentence(in_vocab, pair[0])
target_tensor = tensorFromSentence(out_vocab, pair[1])
return (input_tensor, target_tensor)
def indexToSentence(vocab, indexes):
    sentence = [vocab.index2word[idx] for idx in indexes]
    return ' '.join(sentence)
input_tensor, target_tensor = tensorsFromPair(random.choice(pairs))
print(input_tensor.shape)
print(input_tensor)
print(target_tensor)
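These helpers can be sanity-checked with a round trip (indexToSentence should give back the original sentence with "EOS" appended):

idx = tensorFromSentence(in_vocab, pairs[0][0])
print(indexToSentence(in_vocab, idx.flatten().tolist()))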
Start training
from train import *
from Layers import *
from rnn import *
import util
hidden_size = 256
num_layers = 1
learning_rate = 0.03
decoder_learning_ratio = 1.0
teacher_forcing_ratio = 0.5
output_size = out_vocab.n_words #num of words
encoder = EncoderRNN_Embed(in_vocab.n_words, hidden_size)
decoder = DecoderRNN_Atten(hidden_size,out_vocab.n_words,num_layers,teacher_forcing_ratio)
momentum = 0.3
decay_every =1000
encoder_optimizer = SGD(encoder.parameters(), learning_rate, momentum,decay_every)
decoder_optimizer = SGD(decoder.parameters(), learning_rate*decoder_learning_ratio, momentum,decay_every)
reg = None  # set to e.g. 1e-2 to enable L2 regularization
np.random.shuffle(pairs)
train_n = int(len(pairs) * 0.98)
train_pairs = pairs[:train_n]
valid_pairs = pairs[train_n:]
print_every, plot_every = 100, 100
n_iters = 40000
idx_train_pairs = [tensorsFromPair(random.choice(train_pairs)) for i in range(n_iters)]
idx_valid_pairs = [tensorsFromPair(pair) for pair in valid_pairs]
trainIters(encoder, decoder,encoder_optimizer,decoder_optimizer,idx_train_pairs,idx_valid_pairs,False,print_every, plot_every,reg)
print("train finished!")
Evaluation
Evaluation function
def evaluate(encoder, decoder, in_vocab, out_vocab, sentence, \
             max_length=MAX_LENGTH, last_hidden=True):
    encode_input = tensorFromSentence(in_vocab, sentence)
    encoder_output, encoder_hidden = encoder(encode_input, None)
    if last_hidden:
        output_sentence = decoder.evaluate(encoder_hidden, max_length)
    else:
        output_sentence = decoder.evaluate(encoder_output, max_length)
output_sentence = indexToSentence(out_vocab,output_sentence)
return output_sentence
Run the evaluation
indices = np.random.randint(len(train_pairs), size=3)
for i in indices:
    pair = train_pairs[i]
print(pair)
sentence = pair[0]
sentence = evaluate(encoder, decoder,in_vocab,out_vocab, sentence,MAX_LENGTH,False)
print(sentence)
A simplified version of the decoder above:
from Layers import *
from rnn import *
import util
MAX_LENGTH = 10
class DecoderRNN_Atten_py(object):
def __init__(self, hidden_size, output_size,num_layers=1,teacher_forcing_ratio = 0.5,dropout_p=0.1, \
max_length=MAX_LENGTH):
super(DecoderRNN_Atten_py, self).__init__()
self.hidden_size = hidden_size
        self.num_layers = num_layers
self.teacher_forcing_ratio = teacher_forcing_ratio
self.dropout_p = dropout_p
self.max_length = max_length
self.embedding = Embedding(output_size, hidden_size)
self.dropout = Dropout(self.dropout_p)
self.attn = Dense(self.hidden_size * 2, self.max_length)
#self.attn = Atten(hidden_size)
self.attn_combine = Dense(self.hidden_size * 2, self.hidden_size)
self.relu = Relu()
        self.gru = GRU(hidden_size, hidden_size, num_layers)
self.out = Dense(hidden_size, output_size)
self.layers = [self.embedding,self.attn,self.attn_combine,self.gru,self.out]
self._params = None
self.use_dropout = False
def initHidden(self,batch_size):
# This is what we'll initialise our hidden state as
self.h_0 = np.zeros((self.num_layers, batch_size, self.hidden_size))
def forward_step(self, input, prev_hidden,encoder_outputs,training=True):
embedded = self.embedding(input) #(B,D))
self.embedded_x.append(self.embedding.x)
if self.use_dropout and training:
embedded = self.dropout(embedded,training)
self.dropout_mask.append(self.dropout._mask)
attn_out = self.attn(np.concatenate((embedded, prev_hidden[0]),axis=1))
self.attn_x.append(self.attn.x)
        attn_weights = util.softmax(attn_out)
        self.attn_weights_seq.append(attn_weights)
        attn_applied = bmm(attn_weights, encoder_outputs)
        attn_combine_out = self.attn_combine(np.concatenate((embedded, attn_applied),axis=1))
        self.attn_combine_x.append(self.attn_combine.x)
        output = self.relu(attn_combine_out)
        self.relu_x.append(self.relu.x)
        relu_out = output.reshape(1,output.shape[0],-1)
        self.gru_x.append(relu_out)  # input of the GRU, saved for backward
output_hs, hidden = self.gru(relu_out,prev_hidden)
        self.gru_hs.append(self.gru.hs)  # save the hidden states for backward
        self.gru_zs.append(self.gru.zs)  # save the intermediate results for backward
        output_hs_t = output_hs[0]  # seq_len == 1
        output = self.out(output_hs_t)
        self.out_x.append(self.out.x)
return output,hidden,output_hs_t
    def init_temp_variable(self):
self.embedded_x = []
self.dropout_mask = []
self.attn_x = []
self.attn_weights_seq = []
self.attn_combine_x = []
self.relu_x = []
self.gru_x = []
self.gru_hs = []
self.gru_zs = []
self.out_x = []
    def forward(self, input_tensor, encoder_outputs):
        self.init_temp_variable()
        self.encoder_outputs = encoder_outputs  # (T,B,D) the original (unpadded) encoder states
        target_length = input_tensor.shape[0]
        use_teacher_forcing = random.random() < self.teacher_forcing_ratio
hidden_t = encoder_outputs[-1].reshape(1,encoder_outputs[-1].shape[0],encoder_outputs[-1].shape[1])
h_0 = hidden_t.copy()
input_t = np.array([SOS_token])
output_hs = []
output = []
encoder_outputs = np.pad(self.encoder_outputs,((0,self.max_length-self.encoder_outputs.shape[0]),(0,0),(0,0)), 'constant')
for t in range(target_length):
output_t, hidden_t,output_hs_t = self.forward_step(input_t, hidden_t,encoder_outputs)
output_hs.append(output_hs_t)
output.append(output_t)
            if use_teacher_forcing:
                input_t = input_tensor[t]  # teacher forcing: feed the ground truth
            else:
                input_t = np.argmax(output_t)  # greedy: take the most probable token
                if input_t == EOS_token:
                    break
                input_t = np.array([input_t])
output = np.array(output)
self.output_hs = np.array(output_hs)
self.h_0 = h_0
return output
    def __call__(self, input_tensor, encoder_outputs):
        return self.forward(input_tensor, encoder_outputs)
def evaluate(self, encoder_outputs,max_length):
hidden = encoder_outputs[-1]
hidden = hidden.reshape(1,hidden.shape[0],hidden.shape[1])
input_T = self.encoder_outputs.shape[0]
encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
# input:(1, batch_size=1, input_size)
input = np.array([SOS_token])
decoded_words = []
for t in range(max_length):
output,hidden,_ = self.forward_step(input,hidden,encoder_outputs,False)
output = np.argmax(output)
            if output == EOS_token:
                break
            decoded_words.append(output)
            input = np.array([output])
return decoded_words
def backward(self,dZs):
input_T = self.encoder_outputs.shape[0]
d_encoder_outputs = np.zeros_like(self.encoder_outputs)
T = len(dZs)
encoder_outputs = np.pad( self.encoder_outputs,((0,self.max_length- input_T),(0,0),(0,0)), 'constant')
for i in reversed(range(T)):
self.out.x = self.out_x[i]
dh = self.out.backward(dZs[i])
dhs = np.expand_dims(dh, axis=0)
self.gru.hs = self.gru_hs[i]
self.gru.zs = self.gru_zs[i]
drelu_out,dprev_hidden = self.gru.backward(dhs,self.gru_x[i])
drelu_out = drelu_out.reshape(drelu_out.shape[1],drelu_out.shape[2])
self.relu.x = self.relu_x[i]
d_relu_x = self.relu.backward(drelu_out)
d_attn_combine_out = d_relu_x
self.attn_combine.x = self.attn_combine_x[i]
d_attn_combine_x = self.attn_combine.backward(d_attn_combine_out)
d_embedded, d_attn_applied = d_attn_combine_x[:,:self.hidden_size], d_attn_combine_x[:,self.hidden_size:]
attn_weights = self.attn_weights_seq[i]
d_attn_weights,d_encoder_outputs_t = bmm_backward(d_attn_applied,attn_weights,encoder_outputs)
d_attn_out = softmax_backward_2(attn_weights,d_attn_weights)
self.attn.x = self.attn_x[i]
d_attn_x = self.attn.backward(d_attn_out)
d_embedded_2, dprev_hidden_2 = d_attn_x[:,:self.hidden_size], d_attn_x[:,self.hidden_size:]
d_embedded+= d_embedded_2
if self.use_dropout:
self.dropout._mask = self.dropout_mask[i]
d_embedding = self.dropout.backward(d_embedded)
else:
d_embedding = d_embedded
            self.embedding.x = self.embedded_x[i]  # restore the input cached during forward
            self.embedding.backward(d_embedding)
dprev_hidden+= dprev_hidden_2
            d_encoder_outputs += d_encoder_outputs_t[:input_T]  # accumulate over every time step
            d_encoder_outputs[input_T-1] += dprev_hidden[0]
        return dprev_hidden, d_encoder_outputs[:input_T]
def parameters(self):
if self._params is None:
self._params = []
for layer in self.layers:
for i, _ in enumerate(layer.params):
self._params.append([layer.params[i],layer.grads[i]])
return self._params
Your donation is the greatest encouragement to me!