1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
class GGMLQuantizer:
    """GGML quantization configuration and driver.

    Maps quantization preset names to their bit widths and approximate
    memory-compression factors, and shells out to the llama.cpp
    ``quantize`` binary to perform the actual model conversion.
    """

    # Supported quantization presets.
    # "memory_factor" is the approximate compression ratio relative to
    # fp32 (e.g. 4.5 ~= model shrinks to 1/4.5 of its fp32 size).
    # NOTE: the "description" values are runtime data and are kept
    # byte-identical to the original source.
    QUANT_TYPES = {
        "q4_0": {"bits": 4, "description": "4-bit 量化, 传统方法", "memory_factor": 4.5},
        "q4_1": {"bits": 4, "description": "4-bit 量化, 带scale", "memory_factor": 4.7},
        "q5_0": {"bits": 5, "description": "5-bit 量化", "memory_factor": 5.5},
        "q5_1": {"bits": 5, "description": "5-bit 量化, 带scale", "memory_factor": 5.7},
        "q8_0": {"bits": 8, "description": "8-bit 量化, 几乎无损", "memory_factor": 8.5},
        "f16": {"bits": 16, "description": "半精度", "memory_factor": 2.0},
    }

    def quantize_model(
        self,
        input_path: str,
        output_path: str,
        quant_type: str = "q4_0",
        quantize_binary: str = "./llama.cpp/quantize",
    ) -> bool:
        """Quantize a model by invoking the llama.cpp quantize tool.

        Args:
            input_path: Path to the source model file.
            output_path: Destination path for the quantized model.
            quant_type: Quantization preset name (see ``QUANT_TYPES``;
                the value is passed through to the tool unvalidated, so
                types unknown to this table but known to llama.cpp still
                work).
            quantize_binary: Path to the quantize executable. Default
                matches the original hard-coded path.
                NOTE(review): newer llama.cpp builds name this binary
                ``llama-quantize`` — confirm against the local build.

        Returns:
            True if the quantizer exited with return code 0, else False.

        Raises:
            FileNotFoundError: If ``quantize_binary`` does not exist.
        """
        import subprocess

        # Argument-list form (shell=False) — no shell injection risk
        # from untrusted paths or type names.
        cmd = [quantize_binary, input_path, output_path, quant_type]
        result = subprocess.run(cmd, capture_output=True, text=True)
        return result.returncode == 0
|