LLaVA视觉语言模型
掌握LLaVA模型,实现图像理解与视觉问答
前置知识:需要先掌握 CLIP模型
本文重点:LLaVA使用与图像对话
一、LLaVA概述
LLaVA (Large Language-and-Vision Assistant)
架构:
- 视觉编码器: CLIP ViT
- 投影层: MLP(LLaVA-1.5 使用两层 MLP,将视觉特征映射到语言模型的词嵌入空间)
- 语言模型: Vicuna/LLaMA
能力:
- 图像描述生成
- 视觉问答
- 多轮图像对话
- OCR和文档理解
二、使用LLaVA
2.1 图像问答
from transformers import LlavaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
# Single-turn image question answering with LLaVA-1.5.
#
# Load the checkpoint in half precision; device_map="auto" leaves weight
# placement to the library.
model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# Prepare the input. convert() returns a fully loaded copy, so the file
# handle can be closed as soon as the `with` block exits.
with Image.open("image.jpg") as img:
    image = img.convert("RGB")

# One user turn: an image placeholder followed by the text question.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "请描述这张图片的内容"},
        ],
    }
]

# Render the chat template into a prompt string, then tokenize the text and
# preprocess the image together.
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(images=image, text=prompt, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate the answer without tracking gradients.
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=500)

# generate() returns the prompt tokens followed by the new tokens; drop the
# prompt part so that `response` holds only the model's answer instead of
# echoing the whole conversation.
prompt_len = inputs["input_ids"].shape[1]
response = processor.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
print(response)
2.2 多轮对话
class LLaVAChat:
    """Multi-turn chat session backed by a LLaVA checkpoint.

    The session keeps the text of earlier turns in ``self.history`` and
    replays it on every call, so follow-up questions can refer to previous
    answers. Call :meth:`reset` to start a fresh conversation.
    """

    def __init__(self, model_path="llava-hf/llava-1.5-7b-hf"):
        """Load the model and processor from ``model_path``.

        Args:
            model_path: Hub id or local directory of a LLaVA checkpoint.
        """
        self.model = LlavaForConditionalGeneration.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_path)
        # Alternating user/assistant messages (text only) from earlier turns.
        self.history = []

    def reset(self):
        """Discard all stored turns so the next call starts a new conversation."""
        self.history.clear()

    def chat(self, image_path, question):
        """Ask ``question`` about the image at ``image_path``.

        Args:
            image_path: Path of the image file to show the model.
            question: The user's question for this turn.

        Returns:
            The model's answer for this turn, stripped of surrounding
            whitespace and without the prompt text.
        """
        # convert() returns a fully loaded copy, so the file handle can be
        # released immediately instead of staying open until garbage collection.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        # Earlier turns are replayed as text only; the single image placeholder
        # is attached to the current question, so exactly one image is passed
        # to the processor on every call.
        conversation = [
            *self.history,
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": question}],
            },
        ]

        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = self.processor(images=image, text=prompt, return_tensors="pt")
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=300)

        # generate() returns prompt tokens followed by the new tokens. Slicing
        # by the prompt length isolates the answer without relying on the
        # "ASSISTANT:" marker, which would truncate any answer that happens to
        # contain that text and ties the code to one chat template.
        prompt_len = inputs["input_ids"].shape[1]
        response = self.processor.decode(
            output[0][prompt_len:], skip_special_tokens=True
        ).strip()

        self.history.append({"role": "user", "content": [{"type": "text", "text": question}]})
        self.history.append({"role": "assistant", "content": [{"type": "text", "text": response}]})
        return response
# Usage: create a session (this loads the model), then ask about an image.
chat = LLaVAChat()
answer = chat.chat(image_path="photo.jpg", question="图片中有几个人?")
三、应用场景
3.1 图像描述
def generate_caption(image_path):
    """Set up a detailed-captioning request for the image at ``image_path``.

    Opens the image and builds the single-turn conversation that asks for a
    detailed description. The generation step is elided in this snippet, so
    the function currently returns ``None``.
    """
    image = Image.open(image_path)
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "请详细描述这张图片的内容"},
        ],
    }
    conversation = [user_turn]
    # ... generation code goes here
3.2 OCR文档理解
def understand_document(image_path, question):
    """Build the conversation for asking ``question`` about a document image.

    Only the conversation structure is assembled here; ``image_path`` is not
    read and nothing is generated, so the function returns ``None``.

    Example questions: "What is the topic of this document?" or
    "What data does the table contain?"
    """
    content = [
        {"type": "image"},
        {"type": "text", "text": question},
    ]
    conversation = [{"role": "user", "content": content}]
参考资源
返回:多模态大模型 最后更新: 2026年4月20日
讨论与反馈