model-latency-benchmarking Workspace

Object: BaselineHF

Object

model_name

device

llm_model

tokenizer

predict

use_torch_compile

torch_dtype

set_threads_and_interop

BaselineHF:v36

unsloth/SmolLM2-135M-Instruct-GGUF

cpu

<llama_cpp.llama.Llama object at 0x7a43205be530>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v35

Qwen/Qwen2.5-0.5B-Instruct-GGUF

cpu

<llama_cpp.llama.Llama object at 0x7a4344df3af0>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v34

unsloth/SmolLM2-360M-Instruct-GGUF

cpu

<llama_cpp.llama.Llama object at 0x7a43ffdafe80>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v33

unsloth/SmolLM2-360M-Instruct-GGUF

cpu

<llama_cpp.llama.Llama object at 0x7a43453477c0>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v32

HuggingFaceTB/SmolLM2-135M

cpu

<llama_cpp.llama.Llama object at 0x7a434c75fd90>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v31

HuggingFaceTB/SmolLM2-135M

cpu

<llama_cpp.llama.Llama object at 0x7a43469fe350>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v30

HuggingFaceTB/SmolLM2-135M

cpu

<llama_cpp.llama.Llama object at 0x7a434ad5ffd0>

N/A

BaselineHF.predict:v13

torch.bfloat16

BaselineHF:v29

HuggingFaceTB/SmolLM2-135M

cpu

<llama_cpp.llama.Llama object at 0x7aa0f2f6bd60>

N/A

BaselineHF.predict:v12

torch.bfloat16

BaselineHF:v28

HuggingFaceTB/SmolLM2-135M

cpu

OptimizedModule( (_orig_mod): LlamaForCausalLM( (model): LlamaModel( (embed_tokens): Embedding(49152, 576) (layers): ModuleList( (0-29): 30 x LlamaDecoderLayer( (self_attn): LlamaSdpaAttention( (q_proj): Linear(in_features=576, out_features=576, bias=False) (k_proj): Linear(in_features=576, out_features=192, bias=False) (v_proj): Linear(in_features=576, out_features=192, bias=False) (o_proj): Linear(in_features=5...

GPT2TokenizerFast(name_or_path='HuggingFaceTB/SmolLM2-135M', vocab_size=49152, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_...

BaselineHF.predict:v11

torch.bfloat16

BaselineHF:v27

HuggingFaceTB/SmolLM2-135M

cpu

BaselineHF.predict:v10

torch.bfloat16

BaselineHF:v26

HuggingFaceTB/SmolLM2-135M

cpu

LlamaForCausalLM( (model): LlamaModel( (embed_tokens): Embedding(49152, 576) (layers): ModuleList( (0-29): 30 x LlamaDecoderLayer( (self_attn): LlamaSdpaAttention( (q_proj): Linear(in_features=576, out_features=576, bias=False) (k_proj): Linear(in_features=576, out_features=192, bias=False) (v_proj): Linear(in_features=576, out_features=192, bias=False) (o_proj): Linear(in_features=576, out_features=576, bias=False) (rotar...

BaselineHF.predict:v10

torch.bfloat16

BaselineHF:v25

HuggingFaceTB/SmolLM2-135M

cpu

BaselineHF.predict:v10

torch.bfloat16

BaselineHF:v24

HuggingFaceTB/SmolLM2-135M

cpu

BaselineHF.predict:v9

torch.bfloat16