
Commit 930edb6

Merge pull request #7 from MLSys-UCSD/hao_ai_lab_updates
Hao ai lab project updates: ltr
2 parents 24ead8c + c1e5ef5 commit 930edb6

File tree

1 file changed (+7, -7 lines)


data/projectsData.ts

Lines changed: 7 additions & 7 deletions
@@ -6,6 +6,12 @@ interface Project {
 }
 
 const projectsData: Project[] = [
+  {
+    title: 'Efficient LLM Scheduling by Learning to Rank',
+    description: `Traditional Large Language Model (LLM) serving systems use first-come-first-serve (FCFS) scheduling, leading to delays when longer requests block shorter ones. We introduced a learning-to-rank method to predict output length rankings, enabling a Shortest Job First-like policy and reducing chatbot latency by 6.9x under high load compared to FCFS.`,
+    imgSrc: '/static/images/projects/llm-ltr-cover.jpg',
+    href: 'https://hao-ai-lab.github.io/blogs/vllm-ltr',
+  },
   {
     title: "Can Scheduling Overhead Dominate LLM Inference Performance? A Study of CPU Scheduling Overhead on Two Popular LLM Inference Systems",
     description: "CPU scheduling overhead can dominate LLM inference time—up to 50% in systems like vLLM! Scheduling overhead can no longer be ignored as model forwarding speeds increase and more scheduling tasks get added.",
@@ -41,13 +47,7 @@ const projectsData: Project[] = [
     description: `DistServe is a goodput-optimized LLM serving system that supports prefill-decode disaggregation, a.k.a. splitting prefill from decode into different GPUs, to account for both cost and user satisfaction. DistServe achieves up to 4.48x goodput or 10.2x tighter SLO compared to existing state-of-the-art serving systems, while staying within tight latency constraints.`,
     imgSrc: '/static/images/projects/distserve_anime-crop.gif',
     href: 'https://hao-ai-lab.github.io/blogs/distserve',
-  },
-  {
-    title: 'Efficient LLM Scheduling by Learning to Rank',
-    description: `Traditional Large Language Model (LLM) serving systems use first-come-first-serve (FCFS) scheduling, leading to delays when longer requests block shorter ones. The unpredictability of LLM workloads and output lengths further complicates scheduling. We introduced a learning-to-rank method to predict output length rankings, enabling a Shortest Job First-like policy and reducing chatbot latency by 6.9x under high load compared to FCFS.`,
-    imgSrc: '/static/images/projects/llm-ltr-cover.jpg',
-    href: 'https://hao-ai-lab.github.io/blogs/vllm-ltr',
-  },
+  }
 ]
 
 export default projectsData
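
The learning-to-rank entry added above describes the core scheduling idea: predict a ranking of output lengths and dispatch the request expected to finish soonest, instead of serving strictly in arrival order (FCFS). As a rough illustration of that idea only, and not the project's actual implementation, the TypeScript sketch below uses hypothetical names (InferenceRequest, RankPredictor, RankedScheduler) and a toy prompt-length heuristic in place of the learned ranker.

// Illustrative sketch: an SJF-like queue ordered by a predicted
// output-length rank. All names here are hypothetical and are not
// taken from the vLLM-LTR codebase.
interface InferenceRequest {
  id: string;
  prompt: string;
}

// Stand-in for the learned ranker: a lower score means a shorter predicted output.
type RankPredictor = (request: InferenceRequest) => number;

class RankedScheduler {
  private pending: { request: InferenceRequest; rank: number }[] = [];

  constructor(private predictRank: RankPredictor) {}

  // Admit a request along with its predicted rank.
  enqueue(request: InferenceRequest): void {
    this.pending.push({ request, rank: this.predictRank(request) });
  }

  // Dispatch the request predicted to finish soonest (SJF-like),
  // rather than the oldest one (FCFS).
  dequeue(): InferenceRequest | undefined {
    if (this.pending.length === 0) return undefined;
    this.pending.sort((a, b) => a.rank - b.rank);
    return this.pending.shift()?.request;
  }
}

// Toy usage: treat longer prompts as longer jobs, purely for demonstration.
const scheduler = new RankedScheduler((r) => r.prompt.length);
scheduler.enqueue({ id: 'a', prompt: 'Write a long essay about LLM serving systems.' });
scheduler.enqueue({ id: 'b', prompt: 'Hi!' });
console.log(scheduler.dequeue()?.id); // 'b' is dispatched first

In the real system the rank would come from a model trained with a learning-to-rank objective over relative output lengths; the prompt-length heuristic above only stands in so the sketch stays self-contained.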
