@@ -335,8 +335,12 @@ struct server_slot {
 };
 
 struct server_metrics {
+    const int64_t t_start = ggml_time_us();
+
     uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t t_prompt_processing_total       = 0;
     uint64_t n_tokens_predicted_total        = 0;
+    uint64_t t_tokens_generation_total       = 0;
 
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
@@ -348,12 +352,14 @@ struct server_metrics {
         n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
         n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
         t_prompt_processing             += slot.t_prompt_processing;
+        t_prompt_processing_total       += slot.t_prompt_processing;
     }
 
     void on_prediction(const server_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-        n_tokens_predicted       += slot.n_decoded;
-        t_tokens_generation      += slot.t_token_generation;
+        n_tokens_predicted_total  += slot.n_decoded;
+        n_tokens_predicted        += slot.n_decoded;
+        t_tokens_generation       += slot.t_token_generation;
+        t_tokens_generation_total += slot.t_token_generation;
     }
 
     void reset_bucket() {
@@ -1502,9 +1508,12 @@ struct server_context {
15021508 { " idle" , n_idle_slots },
15031509 { " processing" , n_processing_slots },
15041510 { " deferred" , queue_tasks.queue_tasks_deferred .size () },
1511+ { " t_start" , metrics.t_start },
15051512
15061513 { " n_prompt_tokens_processed_total" , metrics.n_prompt_tokens_processed_total },
1514+ { " t_tokens_generation_total" , metrics.t_tokens_generation_total },
15071515 { " n_tokens_predicted_total" , metrics.n_tokens_predicted_total },
1516+ { " t_prompt_processing_total" , metrics.t_prompt_processing_total },
15081517
15091518 { " n_prompt_tokens_processed" , metrics.n_prompt_tokens_processed },
15101519 { " t_prompt_processing" , metrics.t_prompt_processing },
@@ -1517,7 +1526,9 @@ struct server_context {
15171526 { " slots" , slots_data },
15181527 };
15191528
1520- metrics.reset_bucket ();
1529+ if (json_value (task.data , " reset_bucket" , false )) {
1530+ metrics.reset_bucket ();
1531+ }
15211532 queue_results.send (res);
15221533 } break ;
15231534 }
@@ -2709,6 +2720,7 @@ int main(int argc, char ** argv) {
         task.id_multi  = -1;
         task.id_target = -1;
         task.type = SERVER_TASK_TYPE_METRICS;
+        task.data.push_back({{"reset_bucket", true}});
 
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);
@@ -2732,36 +2744,44 @@ int main(int argc, char ** argv) {
27322744 {" counter" , {{
27332745 {" name" , " prompt_tokens_total" },
27342746 {" help" , " Number of prompt tokens processed." },
2735- {" value" , data[" n_prompt_tokens_processed_total" ]}
2747+ {" value" , (uint64_t ) data[" n_prompt_tokens_processed_total" ]}
2748+ }, {
2749+ {" name" , " prompt_seconds_total" },
2750+ {" help" , " Prompt process time" },
2751+ {" value" , (uint64_t ) data[" t_prompt_processing_total" ] / 1 .e3 }
27362752 }, {
27372753 {" name" , " tokens_predicted_total" },
27382754 {" help" , " Number of generation tokens processed." },
2739- {" value" , data[" n_tokens_predicted_total" ]}
2755+ {" value" , (uint64_t ) data[" n_tokens_predicted_total" ]}
2756+ }, {
2757+ {" name" , " tokens_predicted_seconds_total" },
2758+ {" help" , " Predict process time" },
2759+ {" value" , (uint64_t ) data[" t_tokens_generation_total" ] / 1 .e3 }
27402760 }}},
27412761 {" gauge" , {{
27422762 {" name" , " prompt_tokens_seconds" },
27432763 {" help" , " Average prompt throughput in tokens/s." },
2744- {" value" , n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0 }
2764+ {" value" , n_prompt_tokens_processed ? 1 . e3 / t_prompt_processing * n_prompt_tokens_processed : 0 . }
27452765 },{
27462766 {" name" , " predicted_tokens_seconds" },
27472767 {" help" , " Average generation throughput in tokens/s." },
2748- {" value" , n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0 }
2768+ {" value" , n_tokens_predicted ? 1 . e3 / t_tokens_generation * n_tokens_predicted : 0 . }
27492769 },{
27502770 {" name" , " kv_cache_usage_ratio" },
27512771 {" help" , " KV-cache usage. 1 means 100 percent usage." },
27522772 {" value" , 1 . * kv_cache_used_cells / params.n_ctx }
27532773 },{
27542774 {" name" , " kv_cache_tokens" },
27552775 {" help" , " KV-cache tokens." },
2756- {" value" , data[" kv_cache_tokens_count" ]}
2776+ {" value" , ( uint64_t ) data[" kv_cache_tokens_count" ]}
27572777 },{
27582778 {" name" , " requests_processing" },
27592779 {" help" , " Number of request processing." },
2760- {" value" , data[" processing" ]}
2780+ {" value" , ( uint64_t ) data[" processing" ]}
27612781 },{
27622782 {" name" , " requests_deferred" },
27632783 {" help" , " Number of request deferred." },
2764- {" value" , data[" deferred" ]}
2784+ {" value" , ( uint64_t ) data[" deferred" ]}
27652785 }}}
27662786 };
27672787
@@ -2775,13 +2795,16 @@ int main(int argc, char ** argv) {
                 const std::string name = metric_def["name"];
                 const std::string help = metric_def["help"];
 
-                auto value = json_value(metric_def, "value", 0);
+                auto value = json_value(metric_def, "value", 0.);
                 prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
                            << "# TYPE llamacpp:" << name << " " << type << "\n"
                            << "llamacpp:" << name << " " << value << "\n";
             }
         }
 
+        const int64_t t_start = data["t_start"];
+        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
+
         res.set_content(prometheus.str(), "text/plain; version=0.0.4");
         res.status = 200; // HTTP OK
     });
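
For reference, a sketch of the text exposition this patched handler would emit, assuming the lambda above is registered on the server's metrics scrape route (e.g. GET /metrics with metrics enabled, which this diff does not show); the values are illustrative only:

    # HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.
    # TYPE llamacpp:prompt_tokens_total counter
    llamacpp:prompt_tokens_total 1024
    # HELP llamacpp:prompt_seconds_total Prompt process time
    # TYPE llamacpp:prompt_seconds_total counter
    llamacpp:prompt_seconds_total 5.708
    # HELP llamacpp:prompt_tokens_seconds Average prompt throughput in tokens/s.
    # TYPE llamacpp:prompt_tokens_seconds gauge
    llamacpp:prompt_tokens_seconds 179.4

Each scrape response also carries a Process-Start-Time-Unix header built from metrics.t_start, and because the scrape task now pushes {"reset_bucket", true}, it resets the per-bucket counters (n_prompt_tokens_processed, t_prompt_processing, ...) while the new *_total counters keep accumulating.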