我有一些图像(示例略),所有图像的大小都调整为 120x80。我需要识别图像中的内容:数字(1 到 9)或字母(完整英文字母表)。但我的模型没有学习:损失卡在约 3.6 不再下降(CrossEntropyLoss,35 个类;注意 ln(35)≈3.56,正是随机猜测的水平)。
然后我尝试查看每层之后输出的图像,并且在块 3 之后(参见下面的模型)它们绝对相同(有孤立的例外),仅保留白色背景。我的对象(数字/字母)没有进入下一层。我尝试增加 Conv2d 内核的大小,减少过滤器的数量,但它不起作用。
更改:我使用 pytorch。在训练期间,Adam 优化器使用 lr = 0.001,batch_size 尝试了 32、64 - 两者都不起作用。数据集分为 20% - 验证集,80% - 训练集。我尝试训练 100 和 500 epoch,结果是相同的:(蓝色 - 训练集,黄色 - 验证集)。
训练代码:
import matplotlib.pyplot as plt
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, random_split
from torchvision import transforms

from dataset.dataset import CellsDataset
from model import RecognitionModel
batch_size = 32
epochs = 100
lr = 0.001
transform = transforms.Compose([
transforms.Resize((80, 120)), # Изменение размера изображений
# transforms.RandomHorizontalFlip(), # Случайное горизонтальное отражение
# transforms.RandomRotation(20), # Случайное вращение на 20 градусов
# transforms.RandomAffine(degrees=15, translate=(0.1, 0.1)), # Случайная аффинная трансформация
transforms.Grayscale(),
transforms.ToTensor(), # Преобразование в тензор
])
dataset = CellsDataset(transform)
train_dataset, valid_dataset = random_split(dataset, [0.8, 0.2])
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
valid_dataloader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = True)
model = RecognitionModel()
loss_func = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr = lr)
train_losses = []
valid_losses = []
for i in range(epochs):
# Train
optimizer.zero_grad()
image, label = next(iter(train_dataloader))
pred = model(image)
loss = loss_func(pred, label)
train_losses.append(loss.item())
# Validation
image, label = next(iter(valid_dataloader))
pred = model(image)
loss_ = loss_func(pred, label)
valid_losses.append(loss_.item())
# Backward
loss.backward()
optimizer.step()
print(f"Epoch {i+1}/{epochs} Loss {loss.item()} Validation loss {loss_.item()}")
模型代码:
import torch
from torch.nn import Module, Conv2d, MaxPool2d, ReLU, AdaptiveMaxPool2d, Linear, LeakyReLU, Softmax
class CNNBlock(Module):
def __init__(self, in_channels, out_channels):
super().__init__()
self.conv = Conv2d(
in_channels = in_channels,
out_channels = out_channels,
kernel_size = 5,
padding = "same"
)
self.act = ReLU()
self.maxpool = MaxPool2d(
kernel_size = 2,
stride = 2
)
def forward(self, x):
return self.maxpool(self.act(self.conv(x)))
class RecognitionModel(Module):
def __init__(self):
super().__init__()
self.block1 = CNNBlock(1, 32)
self.block2 = CNNBlock(32, 64)
self.block3 = CNNBlock(64, 128)
self.block4 = CNNBlock(128, 256)
self.conv1 = Conv2d(
in_channels = 256,
out_channels = 512,
kernel_size = 3
)
self.act1 = ReLU()
self.conv2 = Conv2d(
in_channels = 512,
out_channels = 1024,
kernel_size = 3
)
self.act2 = ReLU()
self.globalmaxpool = AdaptiveMaxPool2d(output_size = 1)
self.sqz = lambda x: x.squeeze()
self.linear1 = Linear(
in_features = 1024,
out_features = 512
)
self.act3 = LeakyReLU()
self.linear2 = Linear(
in_features = 512,
out_features = 256
)
self.act4 = LeakyReLU()
self.linear3 = Linear(
in_features = 256,
out_features = 128
)
self.act5 = LeakyReLU()
self.linear4 = Linear(
in_features = 128,
out_features = 64
)
self.act6 = LeakyReLU()
self.linear5 = Linear(
in_features = 64,
out_features = 35
)
self.act7 = Softmax()
def forward(self, x):
x = self.block1(x)
x = self.block2(x)
x = self.block3(x)
x = self.block4(x)
x = self.act1(self.conv1(x))
x = self.act2(self.conv2(x))
x = self.globalmaxpool(x)
x = self.sqz(x)
x = self.act3(self.linear1(x))
x = self.act4(self.linear2(x))
x = self.act5(self.linear3(x))
x = self.act6(self.linear4(x))
y = self.act7(self.linear5(x))
return y
组装数据集的代码:
import json
import torch
from torch.utils.data import Dataset
from PIL import Image
class CellsDataset(Dataset):
def __init__(self, transform):
self.classes = "123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
self.transform = transform
with open("dataset/labels.json", 'r') as labels:
self.labels = json.load(labels)
def __len__(self):
return len(self.labels)
def __getitem__(self, index: int):
image_path = "dataset/images/" + self.labels[index]["image"]
label = self.labels[index]["choice"]
label_index = self.classes.index(label.upper())
label_hot_encoding = torch.zeros(len(self.classes))
label_hot_encoding[label_index] = 1
image = Image.open(image_path)
return self.transform(image), label_hot_encoding