Merged
Changes from 1 commit
Commits
73 commits
874c341
support splits in convert.py
christianazinn Apr 27, 2024
72cbd4e
Support split by size and dry run to write estimated shards/filesizes
christianazinn Apr 28, 2024
702a744
Move split functionality to new GGUFManager class
christianazinn Apr 28, 2024
c33bdf3
fix improper function signature
christianazinn Apr 29, 2024
b7c6120
tentative push of convert-hf-to-gguf support
christianazinn May 5, 2024
14b3291
Merge branch 'master' into convert-split
mofosyne May 9, 2024
87a98a5
resolve merge + SplitArguments for easier parsing
christianazinn May 10, 2024
2dd7841
Merge remote-tracking branch 'origin' into convert-split
christianazinn May 23, 2024
3ff27ef
Fix eager tensor memory leak and remove convert.py changes
christianazinn May 23, 2024
6b5c375
refactor SplitStrategy to be a deque
christianazinn May 24, 2024
09baf2f
fix Q8 quantization
christianazinn Jun 3, 2024
240243e
remove unnecessary imports in gguf_manager
christianazinn Jun 3, 2024
140eb52
Merge branch 'master' into convert-split
christianazinn Jun 3, 2024
a9c7703
fix final? merge issue
christianazinn Jun 3, 2024
efead04
fix gguf_writer placement and remove comments
christianazinn Jun 3, 2024
c8ecbc6
oops, actually fix gguf_writer placement
christianazinn Jun 3, 2024
3e9430d
reduce duplicated code from gguf_writer
christianazinn Jun 5, 2024
f6fd3ea
further simplify GGUFManager
christianazinn Jun 5, 2024
bb5ee02
simplify even further and standardize with GGUFWriter
christianazinn Jun 5, 2024
5ad397d
reduce diffs with master
christianazinn Jun 5, 2024
ce7e698
form shards while adding tensors, SHA256 sums agree with master
christianazinn Jun 5, 2024
706bd69
re-add type hint
christianazinn Jun 6, 2024
6a05183
GGUFWriter compatibility fix
christianazinn Jun 6, 2024
3328b0a
Shard dataclass and un-negative dont_add_architecture
christianazinn Jun 6, 2024
1cbab22
type consistency in format_n_bytes_to_str
christianazinn Jun 6, 2024
2037eab
move kv keys to constants.py
christianazinn Jun 6, 2024
83e4a3f
make pathlib explicit
christianazinn Jun 6, 2024
13ffe22
base-1024 bytes to base-1000
christianazinn Jun 6, 2024
6d3a256
rename GGUFManager to GGUFWriterSplit
christianazinn Jun 7, 2024
1312e28
Update gguf-py/gguf/constants.py
christianazinn Jun 7, 2024
5f29d4a
fix convert-hf-to-gguf.py permissions
christianazinn Jun 7, 2024
0283fc1
fix line endings
christianazinn Jun 7, 2024
dc5cf5f
Update gguf-py/gguf/gguf_writer_split.py
christianazinn Jun 7, 2024
e093dfb
convert-hf : restore executable file permission
compilade Jun 7, 2024
9576965
examples/convert-legacy-llama.py: restore executable file permission
christianazinn Jun 8, 2024
c6ae1d6
reinstate original gguf package import and fix type annotation
christianazinn Jun 8, 2024
2e70fa1
attempt to appease the linter
christianazinn Jun 8, 2024
891b19c
attempt 2 to appease the linter
christianazinn Jun 8, 2024
02be0dd
attempt 3 to appease the linter
christianazinn Jun 8, 2024
f658e91
comma consistency
christianazinn Jun 8, 2024
079dfe3
Update convert-hf-to-gguf.py
christianazinn Jun 8, 2024
282e71f
edit cmd line args
christianazinn Jun 9, 2024
666bb09
Merge branch 'master' into convert-split
christianazinn Jun 9, 2024
03cc9bc
use simplification from #7827
christianazinn Jun 9, 2024
97dd416
kv/ti data are still wrong
christianazinn Jun 9, 2024
ff2dd7d
try to refactor kv data (still fails)
christianazinn Jun 9, 2024
ba1be97
fix ti data messiness
christianazinn Jun 9, 2024
69d6e7a
Merge branch 'master' into convert-split
christianazinn Jun 9, 2024
0779f2f
tidy up
christianazinn Jun 9, 2024
a234bf8
fix linting
christianazinn Jun 9, 2024
49b9fbe
actually make the linter happy
christianazinn Jun 9, 2024
0471f67
cleanup round 1
christianazinn Jun 9, 2024
5a96b8f
remove SplitStrategy, SplitArguments
christianazinn Jun 9, 2024
f7ecd99
appease linter
christianazinn Jun 9, 2024
9d7f694
fix typing and clean up
christianazinn Jun 9, 2024
0417104
fix linting
christianazinn Jun 9, 2024
70a6bc9
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 9, 2024
1e2d9cb
progress bar, fix split logic
christianazinn Jun 9, 2024
f7e7983
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
79bd2bf
catch oversights
christianazinn Jun 10, 2024
7eea552
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
99f9a24
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
ad02c94
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
c1b1a29
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
4550826
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
efa0609
swap bar orders
christianazinn Jun 10, 2024
b843445
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
854bd64
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
05b183f
compatibility fix
christianazinn Jun 10, 2024
e9895d2
Update gguf-py/gguf/gguf_writer.py
christianazinn Jun 10, 2024
4e4e376
Merge branch 'master' into convert-split
christianazinn Jun 15, 2024
163712e
Update convert-hf-to-gguf.py
mofosyne Jun 23, 2024
6e4182c
Merge branch 'master' into convert-split
christianazinn Jun 24, 2024
Fix eager tensor memory leak and remove convert.py changes
Removed a memory leak caused by unexpected reference retention to eager tensors.

Also removed GGUFManager functionality in convert.py in favor of specializing for convert-hf-to-gguf.py.
christianazinn committed May 23, 2024
commit 3ff27efa89a42afebf51cbfbc0964f81b479babd
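As a rough illustration of the leak described in the commit message (hypothetical code, not taken from this PR): holding references to eagerly evaluated tensors keeps the whole model resident, whereas streaming them lets each buffer be freed as soon as it has been written.

```python
import numpy as np

def eager_tensors():
    # Stand-in for eagerly evaluated model tensors.
    for i in range(8):
        yield f"blk.{i}.weight", np.zeros((1024, 1024), dtype=np.float32)

def write_tensor(name: str, data: np.ndarray) -> None:
    # Stand-in for writing one tensor to a GGUF shard.
    data.tofile(f"/tmp/{name}.bin")

# Leaky pattern: collecting every tensor up front retains references to all of
# them, so peak memory is roughly the size of the whole model.
retained = list(eager_tensors())
for name, data in retained:
    write_tensor(name, data)

# Fixed pattern: consume the generator directly so each array can be
# garbage-collected once it has been written.
for name, data in eager_tensors():
    write_tensor(name, data)
```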
2 changes: 1 addition & 1 deletion convert-hf-to-gguf.py
@@ -2570,7 +2570,7 @@ def main() -> None:
if args.split_max_tensors and args.split_max_size:
raise ValueError("Can't specify both --split-max-tensors and --split-max-size")

split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments()
split_arguments = gguf.SplitArguments(args=args) if args.split else gguf.SplitArguments()

ftype_map = {
"f32": gguf.LlamaFileType.ALL_F32,
70 changes: 32 additions & 38 deletions convert.py
@@ -24,17 +24,14 @@
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
# TEMPORARY IMPORT - TODO REMOVE
import importlib
gguf = importlib.import_module("gguf-py.gguf")
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

import numpy as np
from sentencepiece import SentencePieceProcessor

if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
# import gguf
import gguf

if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
@@ -1103,8 +1100,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)


class OutputFile:
def __init__(self, fname_out: Path, split_arguments: gguf.SplitArguments, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], split_arguments, endianess=endianess)
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

def add_meta_model(self, params: Params, metadata: Metadata) -> None:
# Metadata About The Model And Its Provenence
@@ -1204,15 +1201,21 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
svocab.add_to_gguf(self.gguf)

def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = int(np.prod(tensor.shape))
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)

def write_meta(self) -> None:
self.gguf.write_to_file(meta_only=True)
self.gguf.write_header_to_file()
self.gguf.write_kv_data_to_file()

def write_tensors(self, ftype: GGMLFileType, concurrency: int) -> None:
self.gguf.write_to_file(ftype=ftype, concurrency=concurrency, write_tensor_data=OutputFile.write_tensor_data)
def write_tensor_info(self) -> None:
self.gguf.write_ti_data_to_file()

# really awkward with how this is managed with gguf_manager.py: maybe refactor at some point?
@staticmethod
def write_tensor_data(ftype: GGMLFileType, model: LazyModel, concurrency: int, writer: gguf.GGUFWriter) -> None:
def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(
@@ -1230,7 +1233,7 @@ def write_tensor_data(ftype: GGMLFileType, model: LazyModel, concurrency: int, w
logger.info(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
writer.write_tensor_data(ndarray)
self.gguf.write_tensor_data(ndarray)

def close(self) -> None:
self.gguf.close()
@@ -1242,7 +1245,7 @@ def write_vocab_only(
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, gguf.SplitArguments(), endianess=endianess)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_model(params, metadata)
@@ -1270,11 +1273,13 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
@staticmethod
def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
split_arguments: gguf.SplitArguments, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, metadata: Metadata = None,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: Metadata = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, split_arguments, endianess=endianess)

of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_model(params, metadata)
@@ -1287,9 +1292,13 @@ def write_all(

# tensor info
for name, lazy_tensor in model.items():
of.gguf.add_tensor_info(name, lazy_tensor)
of.add_tensor_info(name, lazy_tensor)

of.write_meta()
of.write_tensor_info()

of.write_tensors(ftype, concurrency)
# tensor data
of.write_tensor_data(ftype, model, concurrency)

of.close()
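For context, the restored path above drives a plain gguf.GGUFWriter in the usual header → KV data → tensor info → tensor data order. A minimal sketch of that sequence, assuming the gguf-py API only as it is called in this diff (file name, metadata key, and tensor are placeholders):

```python
import numpy as np
import gguf

writer = gguf.GGUFWriter("sketch.gguf", arch="llama")
writer.add_name("sketch-model")  # example KV metadata

# Declare tensor info first, mirroring OutputFile.add_tensor_info above.
data = np.zeros((16, 16), dtype=np.float32)
writer.add_tensor_info("blk.0.attn_q.weight", data.shape, data.dtype, data.nbytes)

writer.write_header_to_file()    # write_meta: header ...
writer.write_kv_data_to_file()   # ... then KV metadata
writer.write_ti_data_to_file()   # write_tensor_info
writer.write_tensor_data(data)   # write_tensor_data, tensor by tensor
writer.close()
```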

@@ -1364,7 +1373,7 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
else:
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.model_classweight")
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)

# HF models permut or pack some of the tensors, so we need to undo that
@@ -1584,11 +1593,6 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--split", action="store_true", help="split the converted model into multiple files")
parser.add_argument("--split-max-tensors", type=int, help="max tensors in each split")
parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)")
parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files")
parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
@@ -1622,14 +1626,6 @@ def main(args_in: list[str] | None = None) -> None:
do_dump_model(model_plus)
return

if args.split and not (args.split_max_tensors or args.split_max_size):
raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting")

if args.split_max_tensors and args.split_max_size:
raise ValueError("Can't specify both --split-max-tensors and --split-max-size")

split_arguments = gguf.SplitArguments(args) if args.split else gguf.SplitArguments()

if not args.vocab_only:
model_plus = load_some_model(args.model)
else:
@@ -1707,13 +1703,11 @@ def main(args_in: list[str] | None = None) -> None:
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)

params.ftype = ftype

logger.info(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, split_arguments,
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
if not args.dry_run:
logger.info(f"Wrote {outfile}")
logger.info(f"Wrote {outfile}")


if __name__ == '__main__':