学习顺序说明:本文是AI学习路线的第9篇,建议按顺序学习:
- 01 入门基础 → … → 08 AI工具链 → 09 计算机视觉(本文)→ 10 多模态大模型 → 11 AI Agent → 12 项目实战
计算机视觉是AI的重要分支,让机器能够“看懂”图像和视频。本文介绍计算机视觉的核心技术和应用。
计算机视觉概述
主要任务
计算机视觉
├── 图像分类 (Image Classification)
├── 目标检测 (Object Detection)
├── 图像分割 (Image Segmentation)
│ ├── 语义分割
│ └── 实例分割
├── 目标跟踪 (Object Tracking)
├── 图像生成 (Image Generation)
└── 视觉问答 (Visual QA)
参考资源:CS231n: CNN for Visual Recognition - Stanford经典课程
第一部分:图像处理基础
1.1 OpenCV基础操作
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Load an image from disk (OpenCV reads channels in BGR order)
img = cv2.imread('image.jpg')
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # BGR -> RGB, e.g. for matplotlib display

# Inspect basic properties
print(f"图像尺寸: {img.shape}")  # (height, width, channels)

# Geometric operations
resized = cv2.resize(img, (224, 224))  # scale to a fixed resolution
cropped = img[100:300, 200:400]        # crop via NumPy slicing: [y1:y2, x1:x2]

# Color conversion and filtering
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # single-channel grayscale
blurred = cv2.GaussianBlur(img, (5, 5), 0)    # 5x5 Gaussian smoothing
edges = cv2.Canny(gray, 100, 200)             # Canny edge map (low/high thresholds)

# Show the image in a window until any key is pressed
cv2.imshow('Image', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
1.2 图像增强
# 亮度/对比度调整
def adjust_brightness_contrast(img, brightness=0, contrast=0):
    """Apply an optional brightness offset and contrast gain to an image.

    brightness is added as an offset (beta); contrast is used directly as a
    multiplicative gain (alpha). A value of 0 skips the corresponding step.
    Results are saturated to the valid uint8 range by convertScaleAbs.
    """
    out = img
    if brightness:
        # out = img * 1 + brightness
        out = cv2.convertScaleAbs(out, alpha=1, beta=brightness)
    if contrast:
        # out = img * contrast + 0
        out = cv2.convertScaleAbs(out, alpha=contrast, beta=0)
    return out
# 直方图均衡化
def histogram_equalization(img):
    """Equalize the histogram of a grayscale or BGR color image."""
    if img.ndim != 3:
        # Single-channel image: equalize directly
        return cv2.equalizeHist(img)
    # Color image: equalize only the luma (Y) channel in YUV space so the
    # colors themselves are not distorted, then convert back to BGR.
    yuv = cv2.cvtColor(img, cv2.COLOR_BGR2YUV)
    yuv[:, :, 0] = cv2.equalizeHist(yuv[:, :, 0])
    return cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR)
# Data augmentation pipeline for training (torchvision)
from torchvision import transforms

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),                # mirror left/right half the time
    transforms.RandomRotation(15),                         # rotate within +/-15 degrees
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),   # random crop, then resize to 224
    transforms.ToTensor(),                                 # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet stats
])
参考资源:OpenCV官方文档
第二部分:图像分类
2.1 经典CNN架构
ResNet迁移学习
import torch
import torch.nn as nn
from torchvision import models

# Transfer learning: start from an ImageNet-pretrained ResNet-50
model = models.resnet50(pretrained=True)

# Replace the classification head with one sized for our task
num_classes = 10
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Freeze the backbone, then re-enable gradients only for the new head
for p in model.parameters():
    p.requires_grad = False
for p in model.fc.parameters():
    p.requires_grad = True

# Loss and optimizer — optimize only the head's parameters
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.001)
EfficientNet
# EfficientNet from torchvision: swap in a new classification head
from torchvision.models import efficientnet_b0

model = efficientnet_b0(pretrained=True)
# classifier is Sequential(Dropout, Linear); replace only the Linear layer
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
2.2 图像分类完整流程
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

# Preprocessing: random augmentation for training,
# deterministic resize + center crop for validation
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Datasets laid out as train/<class_name>/*.jpg and val/<class_name>/*.jpg
train_dataset = datasets.ImageFolder('train/', transform=train_transform)
val_dataset = datasets.ImageFolder('val/', transform=val_transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Pretrained backbone with a head sized to the number of discovered classes
model = models.resnet50(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(train_dataset.classes))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# 训练循环
def train_epoch(model, loader, criterion, optimizer):
model.train()
running_loss = 0.0
correct = 0
for inputs, labels in loader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
_, predicted = outputs.max(1)
correct += predicted.eq(labels).sum().item()
return running_loss / len(loader), correct / len(loader.dataset)
第三部分:目标检测
3.1 目标检测概述
| 方法 | 类型 | 特点 |
|---|---|---|
| YOLO | 单阶段 | 速度快,实时检测 |
| SSD | 单阶段 | 多尺度检测 |
| Faster R-CNN | 两阶段 | 精度高,速度较慢 |
| DETR | Transformer | 端到端,无需NMS |
3.2 YOLO使用
# Object detection with Ultralytics YOLOv8
from ultralytics import YOLO

# Pretrained weights; the "n" (nano) variant trades accuracy for speed
model = YOLO('yolov8n.pt')

# Run inference on a single image
results = model('image.jpg')

# Each detected box carries coordinates, a confidence score and a class id
for result in results:
    for box in result.boxes:
        x1, y1, x2, y2 = box.xyxy[0]
        confidence = box.conf[0]
        class_id = box.cls[0]
        print(f"类别: {model.names[int(class_id)]}, 置信度: {confidence:.2f}")
        print(f"边界框: ({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")

# Visualize and persist the (last) annotated result
result.show()
result.save('output.jpg')

# Fine-tune on a custom dataset described by a YAML config
model.train(data='dataset.yaml', epochs=50, imgsz=640)
3.3 目标检测评估指标
# IoU (Intersection over Union)
def calculate_iou(box1, box2):
    """Return the Intersection-over-Union of two [x1, y1, x2, y2] boxes."""
    # Intersection rectangle (may be empty if the boxes don't overlap)
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    # Union = sum of the individual areas minus the overlap
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    # Guard against degenerate (zero-area) boxes
    return inter / union if union > 0 else 0
# mAP计算
# mAP@0.5: IoU阈值0.5时的平均精度
# mAP@0.5:0.95: IoU阈值从0.5到0.95,步长0.05的平均mAP
参考资源:YOLOv8官方文档
第四部分:图像分割
4.1 语义分割
U-Net架构
import torch
import torch.nn as nn
class UNet(nn.Module):
    """Minimal U-Net: 4-level encoder, skip connections, 3-level decoder.

    Input spatial dimensions must be divisible by 8 (three 2x poolings).
    """

    def __init__(self, in_channels=3, out_channels=1):
        super().__init__()
        # Encoder: channel width doubles at every level
        self.enc1 = self.conv_block(in_channels, 64)
        self.enc2 = self.conv_block(64, 128)
        self.enc3 = self.conv_block(128, 256)
        self.enc4 = self.conv_block(256, 512)
        self.pool = nn.MaxPool2d(2)
        # Decoder: transpose-conv upsampling. Each dec block consumes the
        # upsampled features concatenated with the matching encoder output,
        # hence the doubled input channel counts.
        self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = self.conv_block(512, 256)
        self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = self.conv_block(256, 128)
        self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = self.conv_block(128, 64)
        # 1x1 conv maps the last 64 channels to the requested output channels
        self.final = nn.Conv2d(64, out_channels, 1)

    def conv_block(self, in_ch, out_ch):
        """Two (Conv3x3 -> BatchNorm -> ReLU) stages at fixed resolution."""
        stages = [
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]
        return nn.Sequential(*stages)

    def forward(self, x):
        # Contracting path — keep each level's output for the skip connections
        skip1 = self.enc1(x)
        skip2 = self.enc2(self.pool(skip1))
        skip3 = self.enc3(self.pool(skip2))
        bottom = self.enc4(self.pool(skip3))
        # Expanding path: upsample, concatenate the skip, then refine
        up = self.dec3(torch.cat([self.up3(bottom), skip3], dim=1))
        up = self.dec2(torch.cat([self.up2(up), skip2], dim=1))
        up = self.dec1(torch.cat([self.up1(up), skip1], dim=1))
        return self.final(up)
4.2 实例分割
使用Mask R-CNN
import torchvision
from torchvision.models.detection import maskrcnn_resnet50_fpn

# COCO-pretrained Mask R-CNN, switched to inference mode
model = maskrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Load an image and convert it to a CHW float tensor in [0, 1]
from PIL import Image
import torchvision.transforms as T

img = Image.open('image.jpg')
img_tensor = T.ToTensor()(img)

# The detection API takes a list of image tensors
with torch.no_grad():
    prediction = model([img_tensor])

# Per-image outputs: boxes, class labels, confidence scores, soft masks
boxes = prediction[0]['boxes']
labels = prediction[0]['labels']
scores = prediction[0]['scores']
masks = prediction[0]['masks']

# Keep only confident detections and pull out their per-instance masks
for i, (box, mask) in enumerate(zip(boxes, masks)):
    if scores[i] > 0.5:  # confidence threshold
        mask = mask[0].numpy()  # (H, W) float mask for this instance
        # Apply the mask here (e.g. overlay it on the image)
        pass
参考资源:Mask R-CNN论文
第五部分:实战应用
5.1 人脸检测与识别
import cv2

# Classic Haar-cascade face detector shipped with OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def detect_faces(img):
    """Draw a blue rectangle around every detected face and return the image."""
    # The cascade operates on grayscale input
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # scaleFactor: image-pyramid step; minNeighbors: stability filter
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
    return img
# Deep-learning face pipeline: MTCNN for detection, FaceNet for embeddings
from facenet_pytorch import InceptionResnetV1, MTCNN

mtcnn = MTCNN()  # face detector / aligner
resnet = InceptionResnetV1(pretrained='vggface2').eval()  # face feature extractor

# Detect and crop the face region
img = Image.open('face.jpg')
face = mtcnn(img)

# Compute the embedding vector; unsqueeze adds the batch dimension
embedding = resnet(face.unsqueeze(0))
5.2 OCR文字识别
# Text recognition with PaddleOCR (angle classification on, Chinese model)
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_angle_cls=True, lang='ch')
result = ocr.ocr('document.jpg', cls=True)
for line in result:
    print(line)
学习资源
官方文档
经典课程
- CS231n - Stanford
- Deep Learning for Computer Vision
经典论文
最后更新: 2026年4月10日