[Repost] A small Python script for testing model training
A simple test script for checking how a machine performs on deep-learning training. Save it as train_simple.py and run it directly with Python; it only needs torch, diffusers, and psutil installed.
# train_simple.py
import torch
import time
from diffusers import UNet2DModel
from pathlib import Path
import platform
import psutil
# -----------------------------
# 1. Print system information
# -----------------------------
def print_system_info():
    print("Detecting system information...")

    # CPU name: platform.processor() is often empty on Linux, so fall back to /proc/cpuinfo
    cpu_name = platform.processor()
    if not cpu_name:
        try:
            with open("/proc/cpuinfo", "r") as f:
                for line in f:
                    if "model name" in line:
                        cpu_name = line.split(":")[1].strip().split(" @ ")[0]
                        break
        except Exception:
            cpu_name = "Unknown CPU"
    cpu_cores = psutil.cpu_count(logical=False)
    cpu_threads = psutil.cpu_count(logical=True)
    print(f"CPU: {cpu_name} ({cpu_cores} cores / {cpu_threads} threads)")

    # Memory
    memory = psutil.virtual_memory()
    total_ram = memory.total / (1024**3)
    available_ram = memory.available / (1024**3)
    print(f"RAM: {total_ram:.2f} GB total, {available_ram:.2f} GB available")

    # GPU
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"GPU: {gpu_count} device(s) available (CUDA)")
        for i in range(gpu_count):
            gpu_name = torch.cuda.get_device_name(i)
            print(f"  [{i}] {gpu_name}")
    else:
        print("GPU: not detected (CUDA unavailable)")
    print("-" * 50)
print_system_info()
print("? 开始执行 UNet2DModel 分阶段训练测试...")
# -----------------------------
# 2. Start the global timer
# -----------------------------
total_start_time = time.time()
# Create the model (no device specified)
model = UNet2DModel(
    sample_size=32,
    in_channels=3,
    out_channels=3,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "AttnDownBlock2D"),
    up_block_types=("UpBlock2D", "AttnUpBlock2D"),
)
# Dummy data
batch_size = 2
x_cpu = torch.randn(batch_size, 3, 32, 32)
y_cpu = torch.randn(batch_size, 3, 32, 32)
timesteps_cpu = torch.zeros(batch_size, dtype=torch.long)
# Show the model's parameter count
def count_params(m):
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

print(f"Model parameters: {count_params(model):,}")
# Optimizer (recreated below for each stage)
optimizer = None
# -----------------------------
# 3. Stage 1: CPU training
# -----------------------------
print("? 开始 CPU 训练...")
cpu_start_time = time.time()
# 使用 CPU
model_cpu = UNet2DModel(
sample_size=32,
in_channels=3,
out_channels=3,
layers_per_block=1,
block_out_channels=(32, 64),
down_block_types=("DownBlock2D", "AttnDownBlock2D"),
up_block_types=("UpBlock2D", "AttnUpBlock2D"),
)
model_cpu.train()
model_cpu = model_cpu.to("cpu")
optimizer = torch.optim.Adam(model_cpu.parameters(), lr=1e-3)
n_steps = 10
for step in range(n_steps):
optimizer.zero_grad()
output = model_cpu(x_cpu, timesteps_cpu).sample
loss = torch.nn.functional.mse_loss(output, y_cpu)
loss.backward()
optimizer.step()
print(f"Step {step+1:2d} | Device: CPU | Loss: {loss.item():.6f}")
cpu_end_time = time.time()
cpu_elapsed = cpu_end_time - cpu_start_time
print(f"⏱️ CPU 训练耗时: {cpu_elapsed:.2f} 秒")
# -----------------------------
# 4. Stage 2: GPU training (if available)
# -----------------------------
gpu_elapsed = 0.0
if torch.cuda.is_available():
    print("Starting GPU training...")
    gpu_start_time = time.time()

    model_gpu = UNet2DModel(
        sample_size=32,
        in_channels=3,
        out_channels=3,
        layers_per_block=1,
        block_out_channels=(32, 64),
        down_block_types=("DownBlock2D", "AttnDownBlock2D"),
        up_block_types=("UpBlock2D", "AttnUpBlock2D"),
    )
    model_gpu.train()
    device = torch.device("cuda")
    model_gpu = model_gpu.to(device)

    x_gpu = x_cpu.to(device)
    y_gpu = y_cpu.to(device)
    timesteps_gpu = timesteps_cpu.to(device)

    optimizer = torch.optim.Adam(model_gpu.parameters(), lr=1e-3)

    for step in range(n_steps):
        optimizer.zero_grad()
        output = model_gpu(x_gpu, timesteps_gpu).sample
        loss = torch.nn.functional.mse_loss(output, y_gpu)
        loss.backward()
        optimizer.step()
        print(f"Step {step+1:2d} | Device: GPU | Loss: {loss.item():.6f}")
    gpu_end_time = time.time()
    gpu_elapsed = gpu_end_time - gpu_start_time
    print(f"⏱️ GPU training time: {gpu_elapsed:.2f} s")
else:
    print("⚠️ GPU unavailable, skipping the GPU training stage")
# -----------------------------
# 5. Save the model (only the last trained one)
# -----------------------------
save_dir = Path("output") / "test-unet-final"
save_dir.mkdir(parents=True, exist_ok=True)

# Prefer the GPU model if it exists, otherwise save the CPU model
final_model = model_gpu if torch.cuda.is_available() else model_cpu
final_model.eval()
final_model.save_pretrained(save_dir)
print(f"Model saved to: {save_dir.resolve()}")
# Cleanup command
if platform.system() == "Windows":
    delete_cmd = f'rd /s /q "{save_dir}"'
else:
    delete_cmd = f'rm -rf "{save_dir}"'
print(f"Cleanup command (if you want to delete it): {delete_cmd}")
# -----------------------------
# 6. Total time statistics
# -----------------------------
total_end_time = time.time()
total_elapsed = total_end_time - total_start_time
print(f"✅ 总训练耗时: {total_elapsed:.2f} 秒")
print(f"? 详细耗时: CPU={cpu_elapsed:.2f}s, GPU={gpu_elapsed:.2f}s")
print("? 所有训练阶段完成!环境一切正常,可开始真实训练。")20250903...
Updated: 2025-09-03 14:01:45