PyTorch에서 학습 모델에 라벨을 지정하는 시퀀스를 수행하고 있습니다. 나는 두 문장을 가지고 있는데, 그 문장이 수반되는지 아닌지를 분류하고있다 (SNLI 데이터 세트). 두 개의 50 단어 문장을 합쳐서 (때로는 패딩 된) 길이가 100 인 벡터로 연결합니다. 그런 다음 미니 바이트를 단어 임베딩 -> LSTM -> 선형 레이어로 보냅니다. 크로스 엔트로피 손실을하고 있지만 CrossEntropyLoss 함수로 들어가려면 [mini_batch, C] 벡터가 필요합니다. 출력PyTorch에서 치수 상실 문제 (레이블 학습에서 시퀀스)
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
super(myLSTM, self).__init__()
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(mlp_d, 1024)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
#self.sm = nn.CrossEntropyLoss()
def display(self):
for param in self.parameters():
print(param.data.size())
def filter_params(self):
# Might not be compatible with python 3
#self.parameters = filter(lambda p: p.requires_grad, self.parameters())
pass
def init_hidden(self):
# Need to init hidden weights in LSTM
pass
def forward(self, sentence):
print(sentence.size())
embeds = self.embedding(sentence)
print(embeds.size())
out, _ = self.lstm(embeds)
print(out.size())
out = self.mlp(out)
return out
내 트레이닝 시퀀스 :
batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
criterion = nn.CrossEntropyLoss()
num_epochs = 50
from torch.autograd import Variable
from torch import optim
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
res = np.concatenate((s1,s2), axis=1) # Attach two sentences into 1 input vector
input_tensor = torch.from_numpy(res).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
data, target = Variable(input_tensor), Variable(target_tensor)
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(data)
print("Output size: ")
print(output.size())
print("Target size: ")
print(target.size())
# Calculate loss with respect to training labels
loss = criterion(output, target)
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
#ADAM_optimizer.step()
출력 :
대신 나는 아직도 [mini_batch, 100, C 여기서로 내 벡터에 100 개 단어가 나의 모델입니다
Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size:
torch.Size([3, 100, 1024])
Target size:
torch.Size([3])
오류 :
ValueError: Expected 2 or 4 dimensions (got 3)
EDITED ------------------------------------------- ------------------------
지금은 모델 훈련을 받았지만 정확도는 떨어지고 있습니다. 내 LSTM 출력을 연결 한 다음 더 작은 텐서로 응축하여 선형 레이어를 통과하는 데 문제가 있습니까?
새로운 모델 : 인쇄 텐서 크기
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
super(myLSTM, self).__init__()
self.num_layers = lstm_layers
self.hidden_size = h_size
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(2 * h_size * 2, num_classes)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
def forward(self, s1, s2):
# Set initial states
#h0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda() # 2 for bidirection
#c0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda()
batch_size = s1.size()[0]
embeds_1 = self.embedding(s1)
embeds_2 = self.embedding(s2)
_, (h_1_last, _) = self.lstm(embeds_1)#, (h0, c0)) #note the change here. Last hidden state is taken
_, (h_2_last, _) = self.lstm(embeds_2)#, (h0, c0))
concat = torch.cat((h_1_last, h_2_last), dim=2) #double check the dimension
concat = concat.view(batch_size, -1)
scores = self.mlp(concat)
return scores
새로운 교육
batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
num_epochs = 10
model.train()
if cuda:
model = model.cuda()
criterion = criterion.cuda()
from torch.autograd import Variable
from torch import optim
epoch_losses = []
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))
# Batch loss aggregator
losses = []
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor)
s2 = Variable(s2_tensor)
target = Variable(target_tensor)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(s1,s2)
# Calculate loss with respect to training labels
loss = criterion(output, target)
losses.append(loss.data[0])
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
# concat losses to epoch losses
epoch_losses += losses
교육 :
Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propogation
torch.Size([64, 3])
평가
def eval_model(model, mode='dev'):
file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'
dev_data, _ = obtain_data(file_name)
dev_n_data = vocab.process_data(dev_data)
print("Length of data: {}".format(len(dev_n_data)))
eval_batch_size = 1024
model.eval()
total = len(dev_n_data)
hit = 0
correct = 0
# Batch dev eval
for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):
s1, s2, y = convert_to_numpy(dev_n_data[start:end])
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor, volatile=True)
s2 = Variable(s2_tensor, volatile=True)
target = Variable(target_tensor, volatile=True)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
output = model.forward(s1,s2)
loss = criterion(output, target)
#print("output size: {}".format(output.size()))
#print("target size: {}".format(target.size()))
pred = output.data.max(1)[1] # get the index of the max log-probability
#print(pred[:5])
#print(output[:])
correct += pred.eq(target.data).cpu().sum()
return correct/float(total)
eval_model(model)
글쎄, 당신의 출력 크기가 [3, 100, 1024] 동안'CrossEntropyLoss'는 말한다 : "이'input' 각 클래스에 대한 점수를 포함 할 것으로 예상된다 'input'는 2D'이어야한다. Tensor' 크기의 'batch xn' 이 기준은 크기가 n 인 1D 텐서의 각 값에 대해 'target'으로 클래스 색인 (0에서 nClasses-1)을 기대합니다. " –