4 changes: 2 additions & 2 deletions genai/api/npc_chat_api/config.yaml
@@ -120,9 +120,9 @@ data:
Your current role is: You are in an emergency supply distribution center helping to distribute food and water to the survivors in the city. It's vitally important you remain here to finish what you're working
on - you cannot leave the distribution center.

{relevant}

Talk in a casual manner as a person named Joseph. You are quite busy, so keep your answers concise.

{relevant}
'''

[[chat]]
3 changes: 2 additions & 1 deletion genai/api/npc_chat_api/src/main.py
@@ -15,6 +15,7 @@
from fastapi import FastAPI
from pydantic import BaseModel

import json
import logging
import npc
import requests
@@ -86,7 +87,7 @@ def npc_chat(payload: Payload_NPC_Chat):
try:
logging.info(f'payload: {payload}')
resp = npcs[0].reply(payload.from_id, "Jane", payload.message)
logging.debug(f'resp: {resp}')
logging.debug(f'resp: {json.dumps(resp, indent=2)}')
if not payload.debug:
# Filter to just the response
resp = {"response": resp['response']}
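A note on the `json.dumps(resp, indent=2)` debug line added in main.py above: it assumes `resp` is a JSON-serializable dict. If the reply ever carried a non-serializable value, the debug statement itself would raise. A defensive variant — a sketch only, not code from this PR — could fall back to `repr`:

```python
import json
import logging

def log_reply(resp):
    """Pretty-print a structured reply at DEBUG level, falling back to repr()."""
    try:
        logging.debug("resp: %s", json.dumps(resp, indent=2))
    except (TypeError, ValueError):
        # TypeError: a value is not JSON-serializable; ValueError: circular reference
        logging.debug("resp (not JSON-serializable): %r", resp)
```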
20 changes: 14 additions & 6 deletions genai/api/npc_chat_api/src/npc/chat.py
@@ -18,13 +18,18 @@ def npcs_from_world(world, genai, db):
return [ NPC(entity, genai, db) for entity in world['base'] if entity['entity_type'] == 1 ]

class NPC(object):
_FIRST_HAND = """
_KNOWN = """
- You know the following:
{first_hand}
"""
_FIRST_HAND = """

- You've said the following to other people related to this topic:
{second_hand}
"""
_SECOND_HAND = """

- You trust the following things you've heard:
- You've heard the following from other people related to this topic, but you're not sure if you trust it:
{second_hand}
"""

@@ -43,14 +48,17 @@ def __init__(self, entity, genai, db):
self._per_chat_cost = 10 # bytes to "charge" for each chat

def _format_context(self, knowledge):
first_hand, second_hand = [], []
facts, first_hand, second_hand = [], [], []
for known in knowledge:
who, what = known['provenance'], known['knowledge']
if who:
if who == "I":
first_hand.append(f"* {who} said: {what}")
elif who:
second_hand.append(f"* {who} said: {what}")
else:
first_hand.append(f"* {what}")
relevant = self._FIRST_HAND.format(first_hand='\n'.join(first_hand)) if first_hand else ""
facts.append(f"* {what}")
relevant = self._KNOWN.format(first_hand='\n'.join(facts)) if facts else ""
relevant += self._FIRST_HAND.format(second_hand='\n'.join(first_hand)) if first_hand else ""
relevant += self._SECOND_HAND.format(second_hand='\n'.join(second_hand)) if second_hand else ""
return self._context.format(relevant=relevant)

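To make the rework of `_format_context` above easier to follow: each knowledge row is now bucketed on its `provenance` — `"I"` marks something the NPC itself has said (rendered under `_FIRST_HAND`), any other non-empty name marks hearsay from another character (`_SECOND_HAND`), and an empty provenance marks an established fact (`_KNOWN`). A standalone sketch of that bucketing, using simplified dicts rather than the real database rows:

```python
# Illustrative sketch of the provenance bucketing introduced in chat.py above;
# the real NPC class renders these buckets through its _KNOWN, _FIRST_HAND and
# _SECOND_HAND prompt templates.
def bucket_knowledge(knowledge):
    facts, first_hand, second_hand = [], [], []
    for known in knowledge:
        who, what = known["provenance"], known["knowledge"]
        if who == "I":
            first_hand.append(f"* {who} said: {what}")   # the NPC's own statements
        elif who:
            second_hand.append(f"* {who} said: {what}")  # unverified hearsay
        else:
            facts.append(f"* {what}")                     # established facts
    return facts, first_hand, second_hand

# Example (hypothetical rows):
print(bucket_knowledge([
    {"provenance": None, "knowledge": "The bridge to the east is out."},
    {"provenance": "I", "knowledge": "Water rations arrive at noon."},
    {"provenance": "Jane", "knowledge": "Looters were seen downtown."},
]))
```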
7 changes: 5 additions & 2 deletions genai/api/npc_chat_api/src/npc/db.py
@@ -57,6 +57,8 @@ class Spanner(object):

_KNOWLEDGE = """
WITH maybeRelevant AS (
-- NOTE: Dynamic knowledge may be disabled by changing @entityHistoryDynamicLimit to 0.
--
-- maybeRelevant is a union of "known facts" by @entityId, and world facts (embedded into each entityId where IsWorldData=True),
-- plus anything @entityId said or heard in chat. The `Provenance` column indicates who
-- relayed the fact, with NULL meaning "It is known".
@@ -74,7 +76,7 @@ class Spanner(object):
EventDescriptionEmbedding,
COSINE_DISTANCE(EventDescriptionEmbedding, @embedding) as Distance
FROM EntityHistoryDynamic
WHERE EntityId = @entityId
WHERE EntityId = @entityId
--- adding redundant order by entityId makes it clear we are ordered by primary key prefix rather than leaving it for the optimizer to understand the right thing to do.
--- Because it is primary key ordered, it will not scan everything in entityid then order then do limit. It should just take the most recent 16K.
ORDER BY EntityId, EventTime DESC, MessageId DESC
@@ -122,7 +124,8 @@ class Spanner(object):
def __init__(self, genai, gcfg, cfg):
self._get_embeddings = genai.get_embeddings
self._db = spanner.Client().instance(cfg['instance_id']).database(cfg['database_id'])
self._dynamic_knowledge_limit = 16600 # bounds on all knowledge before relevance, to bound latency
# self._dynamic_knowledge_limit = 16600 # bounds on all knowledge before relevance, to bound latency
self._dynamic_knowledge_limit = 0 # no secondhand/dynamic knowledge

# TODO: This is doing one-by-one insert into the batch, but is getting called in a loop. Be kinder?
@staticmethod
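The db.py change above sets `_dynamic_knowledge_limit` to 0, which — per the new SQL comment — disables chat-derived ("dynamic") knowledge: with `@entityHistoryDynamicLimit` at 0, the `EntityHistoryDynamic` branch of the union contributes no rows, and only curated entity/world facts reach the prompt. A self-contained sketch of that effect (the helper functions below are illustrative stand-ins, not code from this repo):

```python
# Hedged sketch: stand-ins for the two halves of the maybeRelevant union in db.py.
def query_static_knowledge(entity_id):
    # curated entity/world facts (always included)
    return [{"provenance": None, "knowledge": "The city is being evacuated."}]

def query_dynamic_knowledge(entity_id, limit):
    # knowledge derived from chat history; a limit of 0 yields nothing
    rows = [{"provenance": "Jane", "knowledge": "The east road is blocked."}]
    return rows[:limit]

def relevant_knowledge(entity_id, dynamic_limit=0):
    return query_static_knowledge(entity_id) + query_dynamic_knowledge(entity_id, dynamic_limit)

print(relevant_knowledge("joseph"))          # facts only (dynamic knowledge disabled)
print(relevant_knowledge("joseph", 16600))   # facts plus chat-derived knowledge
```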
109 changes: 96 additions & 13 deletions genai/language/huggingface_tgi/k8s.yaml
@@ -1,6 +1,6 @@
# Huggingface TGI Deployments
#
# We create two different deployments that each match the TGI service at the bottom. If you want to play with a different
# We create four different deployments that each match the TGI service at the bottom. If you want to play with a different
# configuration (CPU vs 2xL4 vs 4xL4), scale the replicas up and down.
#
apiVersion: apps/v1
@@ -30,7 +30,7 @@ spec:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.2
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
@@ -43,8 +43,6 @@ spec:
port: 80
failureThreshold: 12
periodSeconds: 5
# Use this image for Gemma support:
# image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
@@ -54,7 +52,6 @@ spec:
#
# To run Gemma:
# - --model-id=google/gemma-7b-it
# (but you'll need the updated image above)
#
# Another alternative, but you'll need a bunch of ephemeral storage:
- --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
@@ -124,7 +121,7 @@ spec:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.2
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
@@ -137,8 +134,6 @@ spec:
port: 80
failureThreshold: 12
periodSeconds: 5
# Use this image for Gemma support:
# image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
@@ -148,7 +143,6 @@ spec:
#
# To run Gemma:
# - --model-id=google/gemma-7b-it
# (but you'll need the updated image above)
#
# Another alternative, but you'll need a bunch of ephemeral storage:
- --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
@@ -193,6 +187,98 @@ spec:
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: huggingface-tgi-gemma-small
labels:
name: huggingface-tgi-gemma-small
spec:
replicas: 0
selector:
matchLabels:
name: huggingface-tgi-api
tgi-config: gemma-7B-2xL4
template:
metadata:
labels:
name: huggingface-tgi-api
tgi-config: gemma-7B-2xL4
spec:
serviceAccountName: k8s-sa-aiplatform
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
cloud.google.com/compute-class: "Accelerator"
containers:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
port: 80
failureThreshold: 240
periodSeconds: 5
readinessProbe:
httpGet:
path: /health
port: 80
failureThreshold: 12
periodSeconds: 5
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
# Alternatives if you don't have an API Key:
# - --model-id=mistralai/Mistral-7B-Instruct-v0.2
# - --model-id=HuggingFaceH4/zephyr-7b-beta
#
# To run Gemma:
- --model-id=google/gemma-7b-it
#
# Another alternative, but you'll need a bunch of ephemeral storage:
# - --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
# To run on 2xL4s we have to quantize even smaller
# - --quantize=bitsandbytes-nf4
# - --quantize=eetq
#
# --num-shard should match nvidia.com/gpu: limits
- --num-shard=2
#
# raise the default input/total tokens to allow for more chat context
- --max-input-length=3072
- --max-total-tokens=4096

# To use Gemma, you need to fill this in. (Preferably, use a secret or a secret manager instead.)
env:
- name: HUGGING_FACE_HUB_TOKEN
value: invalid-api-key
resources:
requests:
cpu: "10"
memory: "80Gi"
ephemeral-storage: "100Gi"
nvidia.com/gpu: 2
limits:
cpu: "20"
memory: "160Gi"
ephemeral-storage: "200Gi"
nvidia.com/gpu: 2
volumeMounts:
- mountPath: /dev/shm
name: shm
- mountPath: /data
name: data
volumes:
# # c.f. https://github.com/huggingface/text-generation-inference#a-note-on-shared-memory-shm
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
- name: data
emptyDir: {}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: huggingface-tgi-mistral-cpu
labels:
@@ -218,7 +304,7 @@ spec:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.2
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
@@ -231,8 +317,6 @@ spec:
port: 80
failureThreshold: 12
periodSeconds: 5
# Use this image for Gemma support:
# image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
@@ -242,7 +326,6 @@ spec:
#
# To run Gemma:
# - --model-id=google/gemma-7b-it
# (but you'll need the updated image above)
#
# Another alternative, but you'll need a bunch of ephemeral storage:
# - --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
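Once one of the TGI deployments above is scaled up, the raised limits (`--max-input-length=3072`, `--max-total-tokens=4096`) bound how much chat context plus generated text fits in one request. A minimal smoke test against TGI's `/generate` endpoint — the service URL below is an assumption; substitute the actual Service name, namespace, and port from your cluster:

```python
import requests

# Assumed in-cluster URL for the TGI Service; adjust to your Service definition.
TGI_URL = "http://huggingface-tgi-api:80/generate"

payload = {
    "inputs": "You are Joseph at an emergency supply distribution center. "
              "A survivor asks: where can I get clean water?",
    "parameters": {
        # prompt tokens + max_new_tokens must stay within --max-total-tokens=4096
        "max_new_tokens": 256,
        "temperature": 0.7,
    },
}

resp = requests.post(TGI_URL, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["generated_text"])
```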