diff --git a/docker/README.md b/docker/README.md index 7ca3499c1bb..08fe1dc50d5 100644 --- a/docker/README.md +++ b/docker/README.md @@ -24,7 +24,7 @@ docker compose build OR ```bash docker pull intel/ai-tools:itrex-1.3.0 -docker pull intel/ai-tools:itrex-devel-1.3.0 +docker pull intel/ai-tools:itrex-1.3.0-devel ``` ## Use Docker Image diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py index ff0397a2368..8731dcd2184 100644 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py +++ b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py @@ -601,7 +601,7 @@ def forward(self, sequence_output, pooled_output): class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and - a simple interface for dowloading and loading pretrained models. + a simple interface for downloading and loading pretrained models. """ def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() @@ -663,7 +663,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_d . `model.chkpt` a TensorFlow checkpoint from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ diff --git a/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_utils.h b/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_utils.h index eed0c3bdaad..37069e6c267 100644 --- a/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_utils.h +++ b/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_utils.h @@ -366,7 +366,7 @@ inline float get_mxfp_maxnorm(const JBLAS_DTYPE t, int ebits, int mantissa_bits) return max_norm; } -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(__APPLE__) static void request_perm_xtile_data() { unsigned long bitmask; long rc; diff --git a/intel_extension_for_transformers/llm/library/jblas/jblas/kernel_jit.h b/intel_extension_for_transformers/llm/library/jblas/jblas/kernel_jit.h index 4a711736e9d..46eb4153be9 100644 --- a/intel_extension_for_transformers/llm/library/jblas/jblas/kernel_jit.h +++ b/intel_extension_for_transformers/llm/library/jblas/jblas/kernel_jit.h @@ -1228,7 +1228,7 @@ class PaddingTransInterleaveCvt : protected xbyak::JitAvx512f { jmp(ptr[reg_tmp + reg_tmp2 * sizeof(void*)], T_NEAR); // switch(rows-iterrow) ... 
align(sizeof(intptr_t)); L(l_tail_tbl); - db(reinterpret_cast(nullptr), sizeof(intptr_t)); // case 0 should never occur + db(reinterpret_cast(nullptr), sizeof(intptr_t)); // case 0 should never occur for (int i = 1; i < trans_cell; ++i) putL(l_tail_case[i]); for (int m_tail = 1; m_tail < trans_cell; ++m_tail) { // case (m_tail): diff --git a/intel_extension_for_transformers/llm/runtime/graph/__init__.py b/intel_extension_for_transformers/llm/runtime/graph/__init__.py index af3c79e47fd..1d8d34bb494 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/__init__.py +++ b/intel_extension_for_transformers/llm/runtime/graph/__init__.py @@ -65,7 +65,7 @@ def __import_package(self, model_type): elif model_type == "mistral": import intel_extension_for_transformers.llm.runtime.graph.mistral_cpp as cpp_model else: - raise TypeError("Unspported model type {}!".format(model_type)) + raise TypeError("Unsupported model type {}!".format(model_type)) self.module = cpp_model @staticmethod @@ -134,7 +134,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs): import platform sys_platform = platform.platform().lower() if threads is None: - if "windows" in sys_platform: + if "windows" in sys_platform or "macos" in sys_platform: cpu_count = os.cpu_count() generate_kwargs["threads"] = int(cpu_count) else: @@ -212,7 +212,7 @@ def eos_token_id(self): if self.model_type == 'qwen': return self.tokenizer.special_tokens['<|endoftext|>'] return self.tokenizer.eos_token_id - + def pad_token_id(self): if self.tokenizer.pad_token_id == None: if self.batch_size == 1: diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/CMakeLists.txt b/intel_extension_for_transformers/llm/runtime/graph/core/CMakeLists.txt index bcf34a9ca4b..e443458dfaa 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/core/CMakeLists.txt +++ b/intel_extension_for_transformers/llm/runtime/graph/core/CMakeLists.txt @@ -31,7 +31,7 @@ else () target_link_libraries(ne_layers PUBLIC Threads::Threads jblas::jblas ne_vec) endif() -if(NOT WIN32) +if(NOT WIN32 AND NOT APPLE) target_link_libraries(ne_layers PUBLIC rt) endif() @@ -54,7 +54,7 @@ function(add_test_target src) target_link_options(${test_target} PRIVATE -fsanitize=address) target_include_directories(${test_target} PUBLIC .) target_link_libraries(${test_target} PUBLIC Threads::Threads jblas::jblas ne_vec) - if(NOT WIN32) + if(NOT WIN32 AND NOT APPLE) target_link_libraries(${test_target} PUBLIC rt) endif() add_test(NAME ${test_target} COMMAND ${test_target}) diff --git a/intel_extension_for_transformers/llm/runtime/graph/developer_document.md b/intel_extension_for_transformers/llm/runtime/graph/developer_document.md index fef2fc4702b..628437f8e14 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/developer_document.md +++ b/intel_extension_for_transformers/llm/runtime/graph/developer_document.md @@ -79,8 +79,8 @@ graph LR; We need to implement corresponding serialization methods from pytorch format, which is mainly divided into the following three steps. -## 1.1. Hyperparamters -The term **"hyperparamters"** describes a value that is used to configure the behavior of a large language model; this is in contrast to the model's parameters, which are the weight that were derived in the training process that was used to create the model. Each model defines its own hyperparameter structure that defines the hyperparameter values accepted by that model. 
Valid ITREX graph files must list these values in the correct order, and each value must be represented using the correct data type. Although hyperparameters are different across models, some attributes appear in the hyperparameters for most models: +## 1.1. Hyperparameters +The term **"hyperparameters"** describes a value that is used to configure the behavior of a large language model; this is in contrast to the model's parameters, which are the weights that were derived in the training process that was used to create the model. Each model defines its own hyperparameter structure that defines the hyperparameter values accepted by that model. Valid ITREX graph files must list these values in the correct order, and each value must be represented using the correct data type. Although hyperparameters are different across models, some attributes appear in the hyperparameters for most models: - n_vocab: the size of the model's vocabulary - n_embd: the size of the model's " embedding layer", which is used during prompt ingestion. - n_layer: the number of layers in the model; each layer represents a set of weights. @@ -328,7 +328,7 @@ Most of our model examples only support single prompt processing. You need to ad +set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_link_libraries(${TARGET} PUBLIC ne_layers jblas::jblas) ``` - and and new_model to [models_CMakeLists.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt). + and new_model to [models_CMakeLists.txt](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt). ```diff add_subdirectory(opt) add_subdirectory(bloom) diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/util.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/util.cpp index 936030174de..be86f3ff8a6 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/util.cpp +++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/util.cpp @@ -33,11 +33,11 @@ int32_t get_num_physical_cores() { #elif defined(__APPLE__) && defined(__MACH__) int32_t num_physical_cores; size_t len = sizeof(num_physical_cores); - int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + int result = syscall(SYS_sysctlbyname, "hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); if (result == 0) { return num_physical_cores; } - result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + result = syscall(SYS_sysctlbyname, "hw.physicalcpu", &num_physical_cores, &len, NULL, 0); if (result == 0) { return num_physical_cores; } diff --git a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py index 93bcd8cde76..3cf517b397d 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py +++ b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_llama.py @@ -165,7 +165,7 @@ def guessed(model: 'LazyModel') -> 'Params': n_mult=256, n_head=n_embd // 128, n_head_kv=n_embd // 128, - f_norm_eps=1e-5, + rms_norm_eps=1e-5, n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model), ) @@ -203,7 +203,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params':
) # LLaMA v2 70B params.json - # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, # "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': @@ -230,8 +230,8 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: Path) -> 'Params': n_head=n_head, n_head_kv=n_head_kv, ffn_hidden_size=ffn_hidden_size, - bos_token_id = bos_token_id, - eos_token_id = eos_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, ) @staticmethod @@ -278,7 +278,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: tokenizer = self.sentencepiece_tokenizer for i in range(self.params_vocab_size): - text: bytes + text: bytes if i < tokenizer.vocab_size(): if tokenizer.is_unknown(i): text = " \u2047 ".encode("utf-8") @@ -1086,7 +1086,7 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write(struct.pack("f", params.rope_theta)) self.fout.write(struct.pack("f", params.rope_scale)) - # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp self.fout.write(struct.pack("i", params.bos_token_id)) self.fout.write(struct.pack("i", params.eos_token_id)) @@ -1108,10 +1108,9 @@ def write_vocab(self, vocab: Vocab) -> None: @staticmethod def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0) of = OutputFile(fname_out) - params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32) - of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type=NEFileType.AllF32) of.write_vocab(vocab) of.fout.close() diff --git a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py index aeb029e5ab7..8bdefe4b714 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py +++ b/intel_extension_for_transformers/llm/runtime/graph/scripts/convert_mistral.py @@ -164,7 +164,7 @@ def guessed(model: 'LazyModel') -> 'Params': n_mult=256, n_head=n_embd // 128, n_head_kv=n_embd // 128, - f_norm_eps=1e-5, + rms_norm_eps=1e-5, n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model), ) @@ -192,6 +192,7 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: Path) -> 'Params': ffn_hidden_size=ffn_hidden_size, rms_norm_eps=rms_norm_eps, rope_theta=rope_theta, + rope_scale=rope_scale, ) # LLaMA v2 70B params.json @@ -1064,8 +1065,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write( struct.pack("i", 1) - ) - # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + ) + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp self.fout.write(struct.pack("i", 2)) @@ -1087,10 +1088,9 @@ def write_vocab(self, vocab: Vocab) -> None: @staticmethod def write_vocab_only(fname_out: 
Path, vocab: Vocab) -> None: + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0) of = OutputFile(fname_out) - params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=NEFileType.AllF32) - of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type=NEFileType.AllF32) of.write_vocab(vocab) of.fout.close() diff --git a/intel_extension_for_transformers/llm/runtime/graph/scripts/requirements/common.txt b/intel_extension_for_transformers/llm/runtime/graph/scripts/requirements/common.txt index 441da4dde29..a8f8b58c79f 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/scripts/requirements/common.txt +++ b/intel_extension_for_transformers/llm/runtime/graph/scripts/requirements/common.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.1.0+cpu +torch==2.1.0+cpu ; sys_platform != 'darwin' +torch==2.1.0 ; sys_platform == 'darwin' transformers numpy sentencepiece diff --git a/requirements.txt b/requirements.txt index 291bbba5ace..72d4784db27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ py-cpuinfo setuptools>=65 setuptools_scm[toml]>=6.2 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.1.0+cpu +torch==2.1.0+cpu ; sys_platform != 'darwin' +torch==2.1.0 ; sys_platform == 'darwin' accelerate optimum-intel diff --git a/setup.py b/setup.py index 583a24221ff..2f1a816d839 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ class CMakeBuild(build_ext): @staticmethod def _is_target_file(file_name: str) -> bool: - if file_name.endswith(".dll") or file_name.endswith(".exe") or file_name.endswith(".pyd"): + if file_name.endswith(".dll") or file_name.endswith(".exe") or file_name.endswith(".pyd") or file_name.endswith(".dylib"): return True if file_name.endswith(".so") or ".so." in file_name: return True @@ -234,21 +234,24 @@ def check_submodules(): end = time.time() print(f' --- Submodule initialization took {end - start:.2f} sec') except Exception: - print(' --- Submodule initalization failed') + print(' --- Submodule initialization failed') print('Please run:\n\tgit submodule update --init --recursive') sys.exit(1) if __name__ == '__main__': - ext_modules = [CMakeExtension( - "intel_extension_for_transformers.qbits", 'intel_extension_for_transformers/llm/operator/csrc', lib_only=True)] + ext_modules = [] + if sys.platform != "darwin": + ext_modules.append(CMakeExtension("intel_extension_for_transformers.qbits", + "intel_extension_for_transformers/llm/operator/csrc", lib_only=True)) if not SKIP_RUNTIME: check_submodules() - ext_modules.extend([ - CMakeExtension("intel_extension_for_transformers.neural_engine_py", "intel_extension_for_transformers/llm/runtime/deprecated/"), - CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp", "intel_extension_for_transformers/llm/runtime/graph/"), - ]) - cmdclass={'build_ext': CMakeBuild} + ext_modules.append(CMakeExtension("intel_extension_for_transformers.llm.runtime.graph.mpt_cpp", + "intel_extension_for_transformers/llm/runtime/graph/")) + if sys.platform != "darwin": + ext_modules.append(CMakeExtension("intel_extension_for_transformers.neural_engine_py", + "intel_extension_for_transformers/llm/runtime/deprecated/")) + cmdclass = {'build_ext': CMakeBuild} setup( name="intel-extension-for-transformers",
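Note on the `developer_document.md` hunk above: it states that a valid ITREX graph file must serialize each hyperparameter in a fixed order and with a fixed data type, and names `n_vocab`, `n_embd`, and `n_layer` among the common fields. Below is a minimal Python sketch of that idea, in the same spirit as the `struct.pack` calls in the convert scripts touched by this patch; the field order, dtypes, and file name here are illustrative assumptions, not the project's actual header layout.

```python
import struct
from dataclasses import dataclass


@dataclass
class Hyperparams:
    # Common attributes named in the developer document.
    n_vocab: int  # size of the model's vocabulary
    n_embd: int   # size of the embedding layer
    n_layer: int  # number of transformer layers


def write_hparams(fout, hp: Hyperparams) -> None:
    # The file format fixes both the order and the binary type of each
    # value; writing them in a different order or width would produce an
    # invalid graph file. The exact layout below is illustrative only.
    fout.write(struct.pack("i", hp.n_vocab))
    fout.write(struct.pack("i", hp.n_embd))
    fout.write(struct.pack("i", hp.n_layer))


if __name__ == "__main__":
    with open("model.ne", "wb") as fout:  # hypothetical output path
        write_hparams(fout, Hyperparams(n_vocab=32000, n_embd=4096, n_layer=32))
```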
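Note on the platform gating introduced across `requirements.txt`, `scripts/requirements/common.txt`, `setup.py`, and `graph/__init__.py`: the requirements files use PEP 508 environment markers (`; sys_platform == 'darwin'`) since the `+cpu` wheels from the PyTorch CPU index are not published for macOS, `setup.py` checks `sys.platform` to skip the qbits and neural_engine_py extensions on macOS, and the runtime defaults the thread count from `os.cpu_count()` on Windows and macOS. A small sketch of the same pattern follows; the non-Windows/macOS branch is filled in as an assumption, since it lies outside the hunks shown above.

```python
import os
import platform
import sys


def torch_requirement() -> str:
    # Mirrors the environment markers added to the requirements files:
    # plain torch==2.1.0 on macOS, the CPU-index wheel everywhere else.
    return "torch==2.1.0" if sys.platform == "darwin" else "torch==2.1.0+cpu"


def default_threads() -> int:
    # Mirrors init_from_bin(): Windows and macOS fall back to the logical
    # CPU count. The branch for other platforms is not part of the hunks
    # above, so the fallback here is an assumption (Linux-only API).
    sys_platform = platform.platform().lower()
    if "windows" in sys_platform or "macos" in sys_platform:
        return int(os.cpu_count())
    return len(os.sched_getaffinity(0))  # assumed non-Windows/macOS branch


if __name__ == "__main__":
    print(torch_requirement(), default_threads())
```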