4 changes: 2 additions & 2 deletions genai/api/npc_chat_api/config.yaml
@@ -120,9 +120,9 @@ data:
Your current role is: You are in an emergency supply distribution center helping to distribute food and water to the survivors in the city. It's vitally important you remain here to finish what you're working
on - you cannot leave the distribution center.

{relevant}

Talk in a casual manner as a person named Joseph. You are quite busy, so keep your answers concise.

{relevant}
'''

[[chat]]
3 changes: 2 additions & 1 deletion genai/api/npc_chat_api/src/main.py
@@ -15,6 +15,7 @@
from fastapi import FastAPI
from pydantic import BaseModel

import json
import logging
import npc
import requests
@@ -86,7 +87,7 @@ def npc_chat(payload: Payload_NPC_Chat):
try:
logging.info(f'payload: {payload}')
resp = npcs[0].reply(payload.from_id, "Jane", payload.message)
logging.debug(f'resp: {resp}')
logging.debug(f'resp: {json.dumps(resp, indent=2)}')
if not payload.debug:
# Filter to just the response
resp = {"response": resp['response']}
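A note on the `json.dumps(resp, indent=2)` debug line added in main.py above: it assumes `resp` is a JSON-serializable dict. If the reply ever carried a non-serializable value, the debug statement itself would raise. A defensive variant — a sketch only, not code from this PR — could fall back to `repr`:

```python
import json
import logging

def log_reply(resp):
    """Pretty-print a structured reply at DEBUG level, falling back to repr()."""
    try:
        logging.debug("resp: %s", json.dumps(resp, indent=2))
    except (TypeError, ValueError):
        # TypeError: a value is not JSON-serializable; ValueError: circular reference
        logging.debug("resp (not JSON-serializable): %r", resp)
```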
20 changes: 14 additions & 6 deletions genai/api/npc_chat_api/src/npc/chat.py
@@ -18,13 +18,18 @@ def npcs_from_world(world, genai, db):
return [ NPC(entity, genai, db) for entity in world['base'] if entity['entity_type'] == 1 ]

class NPC(object):
_FIRST_HAND = """
_KNOWN = """
- You know the following:
{first_hand}
"""
_FIRST_HAND = """

- You've said the following to other people related to this topic:
{second_hand}
"""
_SECOND_HAND = """

- You trust the following things you've heard:
- You've heard the following from other people related to this topic, but you're not sure if you trust it:
{second_hand}
"""

@@ -43,14 +48,17 @@ def __init__(self, entity, genai, db):
self._per_chat_cost = 10 # bytes to "charge" for each chat

def _format_context(self, knowledge):
first_hand, second_hand = [], []
facts, first_hand, second_hand = [], [], []
for known in knowledge:
who, what = known['provenance'], known['knowledge']
if who:
if who == "I":
first_hand.append(f"* {who} said: {what}")
elif who:
second_hand.append(f"* {who} said: {what}")
else:
first_hand.append(f"* {what}")
relevant = self._FIRST_HAND.format(first_hand='\n'.join(first_hand)) if first_hand else ""
facts.append(f"* {what}")
relevant = self._KNOWN.format(first_hand='\n'.join(facts)) if facts else ""
relevant += self._FIRST_HAND.format(second_hand='\n'.join(first_hand)) if first_hand else ""
relevant += self._SECOND_HAND.format(second_hand='\n'.join(second_hand)) if second_hand else ""
return self._context.format(relevant=relevant)

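To make the rework of `_format_context` above easier to follow: each knowledge row is now bucketed on its `provenance` — `"I"` marks something the NPC itself has said (rendered under `_FIRST_HAND`), any other non-empty name marks hearsay from another character (`_SECOND_HAND`), and an empty provenance marks an established fact (`_KNOWN`). A standalone sketch of that bucketing, using simplified dicts rather than the real database rows:

```python
# Illustrative sketch of the provenance bucketing introduced in chat.py above;
# the real NPC class renders these buckets through its _KNOWN, _FIRST_HAND and
# _SECOND_HAND prompt templates.
def bucket_knowledge(knowledge):
    facts, first_hand, second_hand = [], [], []
    for known in knowledge:
        who, what = known["provenance"], known["knowledge"]
        if who == "I":
            first_hand.append(f"* {who} said: {what}")   # the NPC's own statements
        elif who:
            second_hand.append(f"* {who} said: {what}")  # unverified hearsay
        else:
            facts.append(f"* {what}")                     # established facts
    return facts, first_hand, second_hand

# Example (hypothetical rows):
print(bucket_knowledge([
    {"provenance": None, "knowledge": "The bridge to the east is out."},
    {"provenance": "I", "knowledge": "Water rations arrive at noon."},
    {"provenance": "Jane", "knowledge": "Looters were seen downtown."},
]))
```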
7 changes: 5 additions & 2 deletions genai/api/npc_chat_api/src/npc/db.py
@@ -57,6 +57,8 @@ class Spanner(object):

_KNOWLEDGE = """
WITH maybeRelevant AS (
-- NOTE: Dynamic knowledge may be disabled by changing @entityHistoryDynamicLimit to 0.
--
-- maybeRelevant is a union of "known facts" by @entityId, and world facts (embedded into each entityId where IsWorldData=True),
-- plus anything @entityId said or heard in chat. The `Provenance` column indicates who
-- relayed the fact, with NULL meaning "It is known".
@@ -74,7 +76,7 @@ class Spanner(object):
EventDescriptionEmbedding,
COSINE_DISTANCE(EventDescriptionEmbedding, @embedding) as Distance
FROM EntityHistoryDynamic
WHERE EntityId = @entityId
WHERE EntityId = @entityId
--- adding redundant order by entityId makes it clear we are ordered by primary key prefix rather than leaving it for the optimizer to understand the right thing to do.
--- Because it is primary key ordered, it will not scan everything in entityid then order then do limit. It should just take the most recent 16K.
ORDER BY EntityId, EventTime DESC, MessageId DESC
@@ -122,7 +124,8 @@ class Spanner(object):
def __init__(self, genai, gcfg, cfg):
self._get_embeddings = genai.get_embeddings
self._db = spanner.Client().instance(cfg['instance_id']).database(cfg['database_id'])
self._dynamic_knowledge_limit = 16600 # bounds on all knowledge before relevance, to bound latency
# self._dynamic_knowledge_limit = 16600 # bounds on all knowledge before relevance, to bound latency
self._dynamic_knowledge_limit = 0 # no secondhand/dynamic knowledge

# TODO: This is doing one-by-one insert into the batch, but is getting called in a loop. Be kinder?
@staticmethod
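The db.py change above sets `_dynamic_knowledge_limit` to 0, which — per the new SQL comment — disables chat-derived ("dynamic") knowledge: with `@entityHistoryDynamicLimit` at 0, the `EntityHistoryDynamic` branch of the union contributes no rows, and only curated entity/world facts reach the prompt. A self-contained sketch of that effect (the helper functions below are illustrative stand-ins, not code from this repo):

```python
# Hedged sketch: stand-ins for the two halves of the maybeRelevant union in db.py.
def query_static_knowledge(entity_id):
    # curated entity/world facts (always included)
    return [{"provenance": None, "knowledge": "The city is being evacuated."}]

def query_dynamic_knowledge(entity_id, limit):
    # knowledge derived from chat history; a limit of 0 yields nothing
    rows = [{"provenance": "Jane", "knowledge": "The east road is blocked."}]
    return rows[:limit]

def relevant_knowledge(entity_id, dynamic_limit=0):
    return query_static_knowledge(entity_id) + query_dynamic_knowledge(entity_id, dynamic_limit)

print(relevant_knowledge("joseph"))          # facts only (dynamic knowledge disabled)
print(relevant_knowledge("joseph", 16600))   # facts plus chat-derived knowledge
```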
109 changes: 96 additions & 13 deletions genai/language/huggingface_tgi/k8s.yaml
@@ -1,6 +1,6 @@
# Huggingface TGI Deployments
#
# We create two different deployments that each match the TGI service at the bottom. If you want to play with a different
# We create four different deployments that each match the TGI service at the bottom. If you want to play with a different
# configuration (CPU vs 2xL4 vs 4xL4), scale the replicas up and down.
#
apiVersion: apps/v1
@@ -30,7 +30,7 @@ spec:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.2
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
@@ -43,8 +43,6 @@ spec:
port: 80
failureThreshold: 12
periodSeconds: 5
# Use this image for Gemma support:
# image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
@@ -54,7 +52,6 @@ spec:
#
# To run Gemma:
# - --model-id=google/gemma-7b-it
# (but you'll need the updated image above)
#
# Another alternative, but you'll need a bunch of ephemeral storage:
- --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
@@ -124,7 +121,7 @@ spec:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.2
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
@@ -137,8 +134,6 @@ spec:
port: 80
failureThreshold: 12
periodSeconds: 5
# Use this image for Gemma support:
# image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
@@ -148,7 +143,6 @@ spec:
#
# To run Gemma:
# - --model-id=google/gemma-7b-it
# (but you'll need the updated image above)
#
# Another alternative, but you'll need a bunch of ephemeral storage:
- --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
@@ -193,6 +187,98 @@ spec:
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: huggingface-tgi-gemma-small
labels:
name: huggingface-tgi-gemma-small
spec:
replicas: 0
selector:
matchLabels:
name: huggingface-tgi-api
tgi-config: gemma-7B-2xL4
template:
metadata:
labels:
name: huggingface-tgi-api
tgi-config: gemma-7B-2xL4
spec:
serviceAccountName: k8s-sa-aiplatform
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-ephemeral-storage-local-ssd: "true"
cloud.google.com/compute-class: "Accelerator"
containers:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
port: 80
failureThreshold: 240
periodSeconds: 5
readinessProbe:
httpGet:
path: /health
port: 80
failureThreshold: 12
periodSeconds: 5
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
# Alternatives if you don't have an API Key:
# - --model-id=mistralai/Mistral-7B-Instruct-v0.2
# - --model-id=HuggingFaceH4/zephyr-7b-beta
#
# To run Gemma:
- --model-id=google/gemma-7b-it
#
# Another alternative, but you'll need a bunch of ephemeral storage:
# - --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
# To run on 2xL4s we have to quantize even smaller
# - --quantize=bitsandbytes-nf4
# - --quantize=eetq
#
# --num-shard should match nvidia.com/gpu: limits
- --num-shard=2
#
# raise the default input/total tokens to allow for more chat context
- --max-input-length=3072
- --max-total-tokens=4096

# To use Gemma, you need to fill this in. (Preferably, use a secret or a secret manager instead.)
env:
- name: HUGGING_FACE_HUB_TOKEN
value: invalid-api-key
resources:
requests:
cpu: "10"
memory: "80Gi"
ephemeral-storage: "100Gi"
nvidia.com/gpu: 2
limits:
cpu: "20"
memory: "160Gi"
ephemeral-storage: "200Gi"
nvidia.com/gpu: 2
volumeMounts:
- mountPath: /dev/shm
name: shm
- mountPath: /data
name: data
volumes:
# # c.f. https://github.com/huggingface/text-generation-inference#a-note-on-shared-memory-shm
- name: shm
emptyDir:
medium: Memory
sizeLimit: 1Gi
- name: data
emptyDir: {}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: huggingface-tgi-mistral-cpu
labels:
@@ -218,7 +304,7 @@ spec:
- name: huggingface-tgi-api
ports:
- containerPort: 80
image: ghcr.io/huggingface/text-generation-inference:1.4.2
image: ghcr.io/huggingface/text-generation-inference:1.4.4
startupProbe:
httpGet:
path: /health
@@ -231,8 +317,6 @@ spec:
port: 80
failureThreshold: 12
periodSeconds: 5
# Use this image for Gemma support:
# image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-hf-tgi-serve:20240220_0936_RC01
args:
# Choose models with multiturn chat support
# Look for `chat_template`, e.g.: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json#L34
@@ -242,7 +326,6 @@ spec:
#
# To run Gemma:
# - --model-id=google/gemma-7b-it
# (but you'll need the updated image above)
#
# Another alternative, but you'll need a bunch of ephemeral storage:
# - --model-id=mistralai/Mixtral-8x7B-Instruct-v0.1
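Once one of the TGI deployments above is scaled up, the raised limits (`--max-input-length=3072`, `--max-total-tokens=4096`) bound how much chat context plus generated text fits in one request. A minimal smoke test against TGI's `/generate` endpoint — the service URL below is an assumption; substitute the actual Service name, namespace, and port from your cluster:

```python
import requests

# Assumed in-cluster URL for the TGI Service; adjust to your Service definition.
TGI_URL = "http://huggingface-tgi-api:80/generate"

payload = {
    "inputs": "You are Joseph at an emergency supply distribution center. "
              "A survivor asks: where can I get clean water?",
    "parameters": {
        # prompt tokens + max_new_tokens must stay within --max-total-tokens=4096
        "max_new_tokens": 256,
        "temperature": 0.7,
    },
}

resp = requests.post(TGI_URL, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["generated_text"])
```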