深度学习基础 - 神经网络与PyTorch入门

前置知识：需要先掌握机器学习基础

本文重点：理解神经网络原理，掌握PyTorch基础操作

一、深度学习概述

1.1 什么是深度学习

深度学习是机器学习的一个分支，使用多层神经网络从数据中自动学习特征表示。

深度学习发展：
├── 1943: 人工神经元模型
├── 1958: 感知机
├── 1986: 反向传播算法
├── 2006: 深度信念网络
├── 2012: AlexNet (ImageNet突破)
├── 2014: GAN
├── 2017: Transformer
└── 2022: ChatGPT (大模型时代)

1.2 为什么选择PyTorch

"""
主流深度学习框架对比：
PyTorch:
- 动态计算图，调试方便
- Python风格API，易上手
- 学术研究首选
- 社区活跃
TensorFlow:
- 静态/动态图混合
- 生产部署成熟
- Google支持
JAX:
- 函数式编程
- 高性能自动微分
- 研究前沿
本教程使用PyTorch
"""

二、PyTorch基础

2.1 安装与配置

# 安装命令
# pip install torch torchvision torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# Report the installed PyTorch build and whether CUDA can be used, then
# choose the device that the rest of the tutorial runs on.
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # No usable GPU: fall back to the CPU.
    device = torch.device('cpu')
else:
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"GPU数量: {torch.cuda.device_count()}")
    device = torch.device('cuda')
print(f"使用设备: {device}")

2.2 张量操作

import numpy as np
# ===== Creating tensors =====
# Tour of basic tensor construction, inspection, indexing, reshaping,
# arithmetic, broadcasting and device transfer. Every step prints its result.
# Build a 1-D tensor from a Python list
t1 = torch.tensor([1, 2, 3, 4, 5])
print(f"一维张量: {t1}")
# Build a tensor from an existing NumPy array
arr = np.array([1, 2, 3])
t2 = torch.from_numpy(arr)
print(f"从NumPy创建: {t2}")
# Factory functions for commonly needed tensors
zeros = torch.zeros(3, 4)           # all zeros
ones = torch.ones(2, 3)             # all ones
rand = torch.rand(3, 3)             # uniform random
randn = torch.randn(3, 3)           # normal random
arange = torch.arange(0, 10, 2)     # stepped sequence
linspace = torch.linspace(0, 1, 5)  # evenly spaced points
# ===== Tensor attributes =====
t = torch.randn(3, 4, 5)
print(f"\n张量形状: {t.shape}")      # torch.Size([3, 4, 5])
print(f"张量维度: {t.dim()}")         # 3
print(f"元素数量: {t.numel()}")       # 60
print(f"数据类型: {t.dtype}")        # torch.float32
# ===== Tensor operations =====
# Indexing and slicing on a 3x4 tensor holding 0..11
t = torch.arange(12).reshape(3, 4)
print(f"\n原张量:\n{t}")
print(f"第一行: {t[0]}")
print(f"第一列: {t[:, 0]}")
print(f"右上角2x2: \n{t[:2, -2:]}")
# Shape manipulation, starting again from a flat 12-element tensor
t = torch.arange(12)
print(f"\n原形状: {t.shape}")
print(f"reshape(3,4): {t.reshape(3, 4).shape}")
print(f"view(2,6): {t.view(2, 6).shape}")
print(f"转置: {t.reshape(3, 4).T.shape}")
# Element-wise arithmetic, dot product and norm on two 3-element vectors
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
print(f"\n加法: {a + b}")
print(f"乘法: {a * b}")
print(f"点积: {torch.dot(a, b)}")
print(f"范数: {torch.norm(a)}")
# Matrix multiplication: (3, 4) times (4, 2), written two equivalent ways
A = torch.randn(3, 4)
B = torch.randn(4, 2)
print(f"\n矩阵乘法: {torch.mm(A, B).shape}")
print(f"或使用@: {(A @ B).shape}")
# ===== Broadcasting =====
# a and b are rebound here to (3, 1) and (1, 4) tensors of ones
a = torch.ones(3, 1)
b = torch.ones(1, 4)
print(f"\n广播后形状: {(a + b).shape}")  # (3, 4)
# ===== GPU operations =====
# Runs only when CUDA is available; t is still the flat arange(12) tensor
if torch.cuda.is_available():
    t_gpu = t.cuda()  # or t.to('cuda')
    print(f"\nGPU张量: {t_gpu.device}")
    
    # Moving a tensor from the GPU back to the CPU
    t_cpu = t_gpu.cpu()  # or t_gpu.to('cpu')

2.3 自动微分

# ===== Basic automatic differentiation =====
# Three short demos: gradients of a scalar expression, backward() on a
# non-scalar result, and managing accumulated gradients.
# Create leaf tensors that track gradients
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
# Build the computation graph: z = x*y + x^2
z = x * y + x ** 2
# Backpropagate from z to fill x.grad and y.grad
z.backward()
print(f"z = {z.item()}")
print(f"∂z/∂x = {x.grad}")  # y + 2x = 3 + 4 = 7
print(f"∂z/∂y = {y.grad}")  # x = 2
# ===== A more complex graph =====
# Keep doubling y until its norm reaches 1000; the number of doublings
# depends on the random starting value of x.
# NOTE(review): `.data` is a legacy accessor; `y.detach()` is presumably the
# preferred spelling today - confirm before changing.
x = torch.randn(3, requires_grad=True)
y = x * 2
while y.data.norm() < 1000:
    y = y * 2
print(f"\ny = {y}")
# y is not a scalar here, so backward() is given a vector v of the same
# length as y to weight each component
v = torch.tensor([0.1, 1.0, 0.0001])
y.backward(v)
print(f"梯度: {x.grad}")
# ===== Gradient management =====
# Zeroing gradients between two backward passes
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x.sum()
y.backward()
print(f"\n第一次梯度: {x.grad}")
x.grad.zero_()  # reset the stored gradient in place before the second pass
y = x.sum()
y.backward()
print(f"清零后梯度: {x.grad}")
# Disable gradient tracking inside a block
with torch.no_grad():
    y = x * 2
    print(f"no_grad内requires_grad: {y.requires_grad}")
# Alternatively, take a detached version of x
y = x.detach()

三、神经网络基础

3.1 神经元与激活函数

import torch.nn.functional as F
import matplotlib.pyplot as plt
# ===== Common activation functions =====
# Evaluate each activation on 100 evenly spaced inputs in [-5, 5].
x = torch.linspace(-5, 5, 100)
sigmoid = torch.sigmoid(x)
tanh = torch.tanh(x)
relu = F.relu(x)
leaky_relu = F.leaky_relu(x, 0.1)
gelu = F.gelu(x)
# Softmax is computed on a separate random 5-element vector; it is not plotted.
softmax = F.softmax(torch.randn(5), dim=0)

# Plot the five curves on a 2x3 grid and hide the unused sixth panel.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
activations = [
    ('Sigmoid', sigmoid),
    ('Tanh', tanh),
    ('ReLU', relu),
    ('LeakyReLU', leaky_relu),
    ('GELU', gelu),
]
panels = axes.ravel()
for ax, (name, act) in zip(panels, activations):
    ax.plot(x.numpy(), act.numpy())
    ax.set_title(name)
    ax.grid(True, alpha=0.3)
    # Thin reference lines through the origin.
    ax.axhline(y=0, color='k', linewidth=0.5)
    ax.axvline(x=0, color='k', linewidth=0.5)
panels[-1].axis('off')
plt.tight_layout()
plt.savefig('activation_functions.png', dpi=100, bbox_inches='tight')
plt.close()

# Print the summary of each activation's characteristics, one per line.
print("\n".join([
    "激活函数特点:",
    "- Sigmoid: 输出(0,1)，梯度消失问题",
    "- Tanh: 输出(-1,1)，零中心",
    "- ReLU: 计算快，解决梯度消失，死神经元问题",
    "- LeakyReLU: 解决ReLU死神经元",
    "- GELU: Transformer常用，平滑",
]))

3.2 构建神经网络

import torch.nn as nn
# ===== Option 1: nn.Sequential =====
# A 784 -> 256 -> 128 -> 10 multilayer perceptron with ReLU and dropout after
# each hidden layer.
#
# The stack deliberately ends at the last Linear layer and outputs raw logits.
# nn.CrossEntropyLoss, the loss this tutorial trains with, applies log-softmax
# internally, so a trailing nn.Softmax would normalise the scores twice and
# weaken the gradients. When probabilities are needed at inference time, call
# torch.softmax(model_seq(x), dim=1) explicitly.
model_seq = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 10),
)
print("Sequential模型:")
print(model_seq)
# ===== 方法2：自定义nn.Module =====
class NeuralNetwork(nn.Module):
    """Fully connected classifier with two hidden stages, returning raw scores.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, and the
    second stage is half as wide as the first. A final Linear layer maps to
    ``num_classes`` outputs; no softmax is applied.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the first hidden layer.
        num_classes: Number of outputs of the final layer.
        dropout_rate: Drop probability passed to ``nn.Dropout``.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout_rate=0.2):
        super().__init__()
        half_hidden = hidden_size // 2

        # Fully connected layers.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.fc3 = nn.Linear(half_hidden, num_classes)

        # Parameter-free layers reused by both hidden stages.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

        # One normalisation layer per hidden stage.
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.batch_norm2 = nn.BatchNorm1d(half_hidden)

    def forward(self, x):
        """Pass ``x`` through both hidden stages and the output layer."""
        hidden = self.dropout(self.relu(self.batch_norm1(self.fc1(x))))
        hidden = self.dropout(self.relu(self.batch_norm2(self.fc2(hidden))))
        return self.fc3(hidden)
# Instantiate the custom network and place it on the selected device.
model = NeuralNetwork(input_size=784, hidden_size=256, num_classes=10).to(device)
print("\n自定义模型:")
print(model)
# Count all parameters, and separately those that require gradients.
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_info)
trainable_params = sum(size for size, needs_grad in param_info if needs_grad)
print(f"\n总参数量: {total_params:,}")
print(f"可训练参数: {trainable_params:,}")
# ===== Forward-pass smoke test =====
# Feed one random batch through the model and report input/output shapes.
batch_size = 32
x = torch.randn(batch_size, 784)
x = x.to(device)
output = model(x)
print(f"\n输入形状: {x.shape}")
print(f"输出形状: {output.shape}")

3.3 损失函数与优化器

import torch.optim as optim
# ===== Loss functions =====
# Catalogue of commonly used losses, optimizers and schedulers. Most objects
# below are only constructed for illustration; only ce_loss is evaluated.
# Classification
ce_loss = nn.CrossEntropyLoss()          # multi-class classification
bce_loss = nn.BCELoss()                  # binary classification
bce_logits_loss = nn.BCEWithLogitsLoss() # binary classification (sigmoid included)
# Regression
mse_loss = nn.MSELoss()                  # mean squared error
mae_loss = nn.L1Loss()                   # mean absolute error
smooth_l1_loss = nn.SmoothL1Loss()       # Smooth L1
# Example: cross-entropy on random scores
logits = torch.randn(5, 10)  # 5 samples, 10 classes
target = torch.tensor([0, 1, 2, 3, 4])  # ground-truth class indices
loss = ce_loss(logits, target)
print(f"交叉熵损失: {loss.item():.4f}")
# ===== Optimizers =====
# A fresh model whose parameters are handed to each optimizer below
model = NeuralNetwork(784, 256, 10)
# SGD with momentum
optimizer_sgd = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# Adam
optimizer_adam = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
# AdamW (with weight decay)
optimizer_adamw = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# Learning-rate schedulers; both are attached to optimizer_adam here
scheduler_step = optim.lr_scheduler.StepLR(optimizer_adam, step_size=10, gamma=0.1)
scheduler_cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer_adam, T_max=50)
# ===== Gradient clipping =====
# Guards against exploding gradients.
# NOTE(review): no backward() has run on this model at this point, so the call
# only illustrates the API; in a training loop it presumably belongs between
# loss.backward() and optimizer.step() - confirm.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

四、训练循环

4.1 完整训练流程

from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# ===== Data preparation =====
# Load the scikit-learn digits dataset as a feature matrix X and labels y.
digits = load_digits()
X, y = digits.data, digits.target
# Split BEFORE scaling. Fitting the scaler on the full dataset would let
# test-set statistics leak into training and inflate the reported test metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardise: learn mean/std from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Convert to PyTorch tensors: float32 features, int64 class labels
# (the label dtype that nn.CrossEntropyLoss expects for class indices).
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.long)
# Wrap the tensors in datasets and loaders; only the training data is shuffled.
train_dataset = TensorDataset(X_train_t, y_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# ===== Model, loss function, optimizer =====
# 64 input features and 10 classes match the digits data loaded above.
model = NeuralNetwork(input_size=64, hidden_size=128, num_classes=10)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# ===== 训练函数 =====
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one full pass over ``dataloader``.

    Args:
        model: Network to optimise; it is switched to training mode.
        dataloader: Iterable of ``(inputs, labels)`` batches supporting ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(loss, accuracy)``: the sum of per-batch loss values divided by the
        number of batches, and the fraction of samples whose highest-scoring
        class equals the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear old gradients, then forward, backward and update.
        optimizer.zero_grad()
        scores = model(inputs)
        batch_loss = criterion(scores, labels)
        batch_loss.backward()
        optimizer.step()

        # Bookkeeping for the epoch-level metrics.
        running_loss += batch_loss.item()
        predictions = scores.max(1)[1]
        n_seen += labels.size(0)
        n_correct += (predictions == labels).sum().item()

    return running_loss / len(dataloader), n_correct / n_seen
# ===== 评估函数 =====
def evaluate(model, dataloader, criterion, device):
    """Measure loss and accuracy of ``model`` on ``dataloader`` without training.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        dataloader: Iterable of ``(inputs, labels)`` batches supporting ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(loss, accuracy)``: the sum of per-batch loss values divided by the
        number of batches, and the fraction of samples whose highest-scoring
        class equals the label.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    # Gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            scores = model(inputs)
            running_loss += criterion(scores, labels).item()

            predictions = scores.max(1)[1]
            n_seen += labels.size(0)
            n_correct += (predictions == labels).sum().item()

    return running_loss / len(dataloader), n_correct / n_seen
# ===== Training loop =====
# Alternate one training pass and one evaluation pass per epoch, recording
# loss and accuracy for both splits so they can be plotted afterwards.
num_epochs = 50
train_losses, test_losses = [], []
train_accs, test_accs = [], []
print("=== 开始训练 ===")
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    train_losses += [train_loss]
    train_accs += [train_acc]
    test_losses += [test_loss]
    test_accs += [test_acc]

    # Report progress only on every 10th epoch.
    if (epoch + 1) % 10 != 0:
        continue
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
# ===== Visualise the training run =====
# Left panel: loss per epoch; right panel: accuracy per epoch. Each panel
# shows the training and test curves together.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, acc_ax = axes

loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(test_losses, label='Test Loss')
loss_ax.set(xlabel='Epoch', ylabel='Loss', title='训练损失曲线')

acc_ax.plot(train_accs, label='Train Acc')
acc_ax.plot(test_accs, label='Test Acc')
acc_ax.set(xlabel='Epoch', ylabel='Accuracy', title='训练准确率曲线')

# Shared cosmetics for both panels.
for panel in axes:
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_curves.png', dpi=100, bbox_inches='tight')
plt.close()

4.2 模型保存与加载

# ===== Save the whole model object =====
# This pickles the module itself, so loading it later requires the
# NeuralNetwork class to be importable under the same name.
torch.save(model, 'model_full.pth')
# ===== Save only the parameters (recommended) =====
torch.save(model.state_dict(), 'model_weights.pth')
# ===== Loading =====
# Option 1: load the full pickled model.
# Recent PyTorch releases default torch.load to weights_only=True, which
# refuses to unpickle a complete nn.Module, so it must be disabled explicitly.
# SECURITY: weights_only=False can execute arbitrary code while unpickling;
# only use it on files you created yourself or fully trust.
loaded_model = torch.load('model_full.pth', map_location=device, weights_only=False)
# Option 2: load the parameters into a freshly constructed model.
# model_new lives on the CPU, so the saved tensors are mapped there; this also
# keeps the load working on a machine without the GPU used for training.
model_new = NeuralNetwork(input_size=64, hidden_size=128, num_classes=10)
model_new.load_state_dict(torch.load('model_weights.pth', map_location='cpu'))
model_new.eval()
# ===== Save a checkpoint =====
# Bundle everything needed to resume training: the epoch counter, model and
# optimizer state, and the last training loss.
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_loss': train_loss,
}
torch.save(checkpoint, 'checkpoint.pth')
# Load the checkpoint back from disk (not from the in-memory dict), so the
# snippet exercises the same path a resumed run would take.
checkpoint = torch.load('checkpoint.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

参考资源

PyTorch官方文档 - 完整API参考

PyTorch官方教程 - 官方学习资源

Deep Learning Book - Goodfellow经典教材

CS231n: CNN for Visual Recognition - 斯坦福课程

Dive into Deep Learning - 交互式深度学习教材

PyTorch Lightning - 简化训练流程

fast.ai - 实战深度学习课程

Neural Networks and Deep Learning - 在线教程

下一篇：CNN卷积神经网络 | 返回：深度学习基础 | 最后更新: 2026年4月14日

访问 --

搜索文章、标签、项目线索

深度学习基础 - 神经网络与PyTorch入门

一、深度学习概述

1.1 什么是深度学习

1.2 为什么选择PyTorch

二、PyTorch基础

2.1 安装与配置

2.2 张量操作

2.3 自动微分

三、神经网络基础

3.1 神经元与激活函数

3.2 构建神经网络

3.3 损失函数与优化器

四、训练循环

4.1 完整训练流程

4.2 模型保存与加载

参考资源

讨论与反馈