转载:测试模型训练的小型 Python 脚本
简单的一个测试脚本,用来测试硬件做深度学习训练时的性能。
# train_simple.py
import torch
import time
from diffusers import UNet2DModel
from pathlib import Path
import platform
import psutil
# -----------------------------
# 1. 系统信息打印
# -----------------------------
def print_system_info():
    """Print a one-screen summary of CPU, RAM and GPU resources to stdout.

    Falls back to parsing /proc/cpuinfo for the CPU model name when
    platform.processor() returns an empty string (common on Linux).
    Requires the third-party ``psutil`` package for core/memory counts.
    """
    print("? 系统信息检测中...")
    cpu_name = platform.processor()
    if not cpu_name:
        # platform.processor() is often empty on Linux; read the kernel's view.
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        # "model name : Intel(R) ... @ 3.60GHz" -> keep the part before " @ "
                        cpu_name = line.split(":")[1].strip().split(" @ ")[0]
                        break
        except (OSError, IndexError):
            # /proc/cpuinfo absent (non-Linux) or an unexpected line format.
            cpu_name = "Unknown CPU"
    cpu_cores = psutil.cpu_count(logical=False)
    cpu_threads = psutil.cpu_count(logical=True)
    print(f"? CPU: {cpu_name} ({cpu_cores} 核 / {cpu_threads} 线程)")
    memory = psutil.virtual_memory()
    total_ram = memory.total / (1024**3)       # bytes -> GiB
    available_ram = memory.available / (1024**3)
    print(f"? 内存: {total_ram:.2f} GB 总计, {available_ram:.2f} GB 可用")
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"? GPU: {gpu_count} 张可用 (CUDA)")
        for i in range(gpu_count):
            gpu_name = torch.cuda.get_device_name(i)
            print(f" ? [{i}] {gpu_name}")
    else:
        print("? GPU: 未检测到 (CUDA 不可用)")
    print("-" * 50)
# Benchmark entry: report hardware first, then run the staged training test.
print_system_info()
print("? 开始执行 UNet2DModel 分阶段训练测试...")

# -----------------------------
# 2. Global timer start
# -----------------------------
total_start_time = time.time()

# Build a tiny UNet (no device assigned yet); this instance is only used
# below to report the parameter count — each training stage builds its own.
model = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "AttnDownBlock2D"),
    up_block_types=("UpBlock2D", "AttnUpBlock2D"),
)

# Synthetic training data: random 32x32 RGB batches; all timesteps fixed at 0.
batch_size = 2
x_cpu = torch.randn(batch_size, 3, 32, 32)
y_cpu = torch.randn(batch_size, 3, 32, 32)
timesteps_cpu = torch.zeros(batch_size, dtype=torch.long)
def count_params(m):
    """Return the number of trainable parameters in module *m*.

    Only parameters with ``requires_grad=True`` are counted, so frozen
    weights are excluded from the total.
    """
    return sum(p.numel() for p in m.parameters() if p.requires_grad)
# Report the model size; the optimizer is (re)created inside each training stage.
print(f"? 模型参数量: {count_params(model):,}")
# Placeholder — rebound to a fresh Adam instance in the CPU and GPU stages below.
optimizer = None
# -----------------------------
# 3. Stage 1: CPU training
# -----------------------------
print("? 开始 CPU 训练...")
cpu_start_time = time.time()

# Fresh model instance so the CPU benchmark is not affected by earlier state.
model_cpu = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "AttnDownBlock2D"),
    up_block_types=("UpBlock2D", "AttnUpBlock2D"),
)
model_cpu.train()
model_cpu = model_cpu.to("cpu")
optimizer = torch.optim.Adam(model_cpu.parameters(), lr=1e-3)

# Run a fixed number of optimization steps and log the loss each step.
n_steps = 10
for step in range(n_steps):
    optimizer.zero_grad()
    output = model_cpu(x_cpu, timesteps_cpu).sample
    loss = torch.nn.functional.mse_loss(output, y_cpu)
    loss.backward()
    optimizer.step()
    print(f"Step {step+1:2d} | Device: CPU | Loss: {loss.item():.6f}")

cpu_end_time = time.time()
cpu_elapsed = cpu_end_time - cpu_start_time
print(f"⏱️ CPU 训练耗时: {cpu_elapsed:.2f} 秒")
# -----------------------------
# 4. Stage 2: GPU training (if available)
# -----------------------------
gpu_elapsed = 0.0
if torch.cuda.is_available():
    print("? 开始 GPU 训练...")
    gpu_start_time = time.time()

    # Fresh model instance so the GPU run is not warm-started by the CPU run.
    model_gpu = UNet2DModel(
        sample_size=32,
        in_channels=3,
        out_channels=3,
        layers_per_block=1,
        block_out_channels=(32, 64),
        down_block_types=("DownBlock2D", "AttnDownBlock2D"),
        up_block_types=("UpBlock2D", "AttnUpBlock2D"),
    )
    model_gpu.train()
    device = torch.device("cuda")
    model_gpu = model_gpu.to(device)
    x_gpu = x_cpu.to(device)
    y_gpu = y_cpu.to(device)
    timesteps_gpu = timesteps_cpu.to(device)
    optimizer = torch.optim.Adam(model_gpu.parameters(), lr=1e-3)

    for step in range(n_steps):
        optimizer.zero_grad()
        output = model_gpu(x_gpu, timesteps_gpu).sample
        loss = torch.nn.functional.mse_loss(output, y_gpu)
        loss.backward()
        optimizer.step()
        print(f"Step {step+1:2d} | Device: GPU | Loss: {loss.item():.6f}")

    # CUDA kernels launch asynchronously; wait for all work to finish so the
    # wall-clock measurement covers the actual GPU computation.
    torch.cuda.synchronize()
    gpu_end_time = time.time()
    gpu_elapsed = gpu_end_time - gpu_start_time
    print(f"⏱️ GPU 训练耗时: {gpu_elapsed:.2f} 秒")
else:
    print("⚠️ GPU 不可用,跳过 GPU 训练阶段")
# -----------------------------
# 5. Save the model (only the last-trained instance)
# -----------------------------
save_dir = Path("output") / "test-unet-final"
save_dir.mkdir(parents=True, exist_ok=True)

# Prefer the GPU-trained model when CUDA is present; model_gpu only exists
# in that case (the GPU stage above is guarded by the same condition).
final_model = model_gpu if torch.cuda.is_available() else model_cpu
final_model.eval()
final_model.save_pretrained(save_dir)
print(f"? 模型已保存至: {save_dir.resolve()}")

# Print a platform-appropriate command the user can run to delete the output.
if platform.system() == "Windows":
    delete_cmd = f'rd /s /q "{save_dir}"'
else:
    delete_cmd = f'rm -rf "{save_dir}"'
print(f"? 清理命令(如需删除): {delete_cmd}")
# -----------------------------
# 6. Total elapsed time
# -----------------------------
# NOTE: the trailing "20250903..." scrape artifact fused onto the final print
# in the pasted source was removed — it was not valid Python.
total_end_time = time.time()
total_elapsed = total_end_time - total_start_time
print(f"✅ 总训练耗时: {total_elapsed:.2f} 秒")
print(f"? 详细耗时: CPU={cpu_elapsed:.2f}s, GPU={gpu_elapsed:.2f}s")
print("? 所有训练阶段完成!环境一切正常,可开始真实训练。")
浏览更多内容请先登录。
立即注册
更新于:2025-09-03 14:01:45
相关内容
python 报错整理2
给出一份「Python 中常见 import 名称 与 pip install 名称不一致」的速查表
Windows用Conda创建环境报错 CondaToSNonInteractiveError: Terms of Se...
快速验证顺序是否被改
Python与模型相关知识以及问题的整理
学习模型蒸馏之蒸馏SoloSpeech 2025年8月
Google colab 测试运行SoloSpeech蒸馏项目中教师模型的训练
学习模型蒸馏之蒸馏SoloSpeech 2025年8月 与通义对话
腾讯云代码助手(Tencent Cloud CodeBuddy)插件在VS Code上
线上训练数据的一些技巧
云主机选择 试试 DigitalOcean 毕竟有新加坡服务器。
云盘选择
Linux/Ubuntu服务器命令行使用百度网盘
SoloSpeech 模型训练终于有了眉目 20250829 2325
各种和模型训练相关的工具
相关问题报错
python 调式代码的几种方法
python报错 ModuleNotFoundError: No module named 'solospeech'
如何用有效的用conda安装python扩展
SoloSpeech 训练的扩展安装
python的一些包或扩展依赖于torch,会在安装的时候安装上torch的CPU版
模型训练过程中的报错 unexpected pos 88457920 vs 88457808
模型训练平台汇总
Copilot的能力不低,不可小觑 20250902
关于魔塔的静默提示,解决静默提醒提示。
python -m py_compile "d:\python\SoloSpeech\solospeech\stable_audio_v...
线上平台和CPU服务器压力测试
Python 学习整理2
fastapi-speaker-extractor whisperx 项目报错 ValueError: The chosen ...
python报错 UnicodeEncodeError: 'gbk' codec can't encode character '\u...
python whisperx 报错 in load_align_model raise ValueError(f'The ch...
pyannote/embedding 模型是真难引入模型
Trae 或者是我自己 莫名奇妙创建了个文件,影响了项目代码的运行。
WhisperX 无法加载模型
HUGGINGFACE_HUB_CACHE 设置错误导致的问题
Trae的bug太多了,怪不得免费
通义之旅
通义之旅2
目标说话人声音提取模型训练的思路
python报错 can't convert cuda:0 device type tensor to numpy. Use Tenso...
Expected all tensors to be on the same device, but found at least two ...
腾讯元宝推荐的项目结构(音频处理项目)
音频处理项目fse
各种python 相关命令
python 报错 SyntaxError: 'return' outside function
python常用命令
腾讯编程助手
python一些扩展兼容安装的处理方案
Python与模型相关知识以及问题的整理之二
通义千问:为什么说 Shell 函数真香?
在 Windows 11 中,要让 PowerShell 启动时自动激活 conda activate train...
CoPilot用Claude Sonnet 4模型调试多平台自动安装python训练模型或机器学习环境
python 和 conda
启动powershell,conda 自动激活base
推荐内容