10 changes: 7 additions & 3 deletions app/index.tsx
@@ -14,14 +14,15 @@ import { logChatCompletionDiagnostics } from '@/services/diagnostics';
import { MessageInput } from '@/components/ui/chat/MessageInput';
import { Model } from '@/services/models';
import { VoiceModeOverlay } from '@/components/VoiceModeScreen';
import { streamLlamaCompletion, generateUniqueId } from '@/services/chat/llama-local';
import { streamLlamaCompletion, generateUniqueId, loadConversationIntoContext, initializeConversationContext } from '@/services/chat/llama-local';


export default function ChatScreen() {
const [messages, setMessages] = useState<Message[]>([]);
const [currentAIMessage, setCurrentAIMessage] = useState<string>('');
const [isStreaming, setIsStreaming] = useState(false);
const [voiceMode, setVoiceMode] = useState(false);
const [isConversationLoaded, setIsConversationLoaded] = useState(false);

const scrollViewRef = useRef<any>(null);
const {
@@ -47,6 +48,7 @@ export default function ChatScreen() {
setMessages([]);
saveCurrentConversation([]);
}
setIsConversationLoaded(true);
};

loadConversationState();
@@ -126,7 +128,7 @@
}

const sendMessage = async (inputText: string) => {
if (!inputText.trim() || isStreaming) return;
if (!inputText.trim() || isStreaming || !isConversationLoaded) return;
if (!selectedModel) return;

const userMessage: Message = createUserMessage(inputText, selectedModel);
@@ -137,7 +139,8 @@

setIsStreaming(true);
try {
// await sendChatMessage(
// In v0.1.4 we still pass the full updated messages array (including the new user message);
// streamLlamaCompletion forwards only the latest entry to the context, which keeps its own history
await streamLlamaCompletion(
cactusContext.context,
updatedMessages,
@@ -147,6 +150,7 @@
true,
tokenGenerationLimit,
isReasoningEnabled,
voiceMode
);
} catch (error) {
console.error('Error in chat:', error);
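The hunk above imports loadConversationIntoContext and initializeConversationContext but only shows the new isConversationLoaded flag being set. A minimal sketch of how the loading effect could wire the two helpers in; the loadCurrentConversation loader and the exact branch structure are assumptions, not shown in this diff:

// Sketch only - assumes loadCurrentConversation() returns the persisted Message[] (or null)
const loadConversationState = async () => {
  if (!cactusContext.context) return;
  const savedMessages = await loadCurrentConversation(); // hypothetical loader
  if (savedMessages && savedMessages.length > 0) {
    setMessages(savedMessages);
    // Replay the saved history into the v0.1.4 context before any new completion
    await loadConversationIntoContext(cactusContext.context, savedMessages, voiceMode);
  } else {
    setMessages([]);
    saveCurrentConversation([]);
    // Fresh conversation: seed the context with just the system message
    await initializeConversationContext(cactusContext.context, voiceMode);
  }
  setIsConversationLoaded(true);
};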
8 changes: 5 additions & 3 deletions components/VoiceModeScreen.tsx
@@ -49,7 +49,7 @@ export const VoiceModeOverlay = ({
setMessages(updatedMessages);
await streamLlamaCompletion(
cactusContext.context,
updatedMessages,
updatedMessages, // Pass the full array; only the latest message is forwarded to the context
currentModel,
setAiMessageText,
(metrics, model, completeMessage) => {
@@ -62,7 +62,7 @@
true
);
}
}, [messages, selectedModelRef, tokenGenerationLimit])//, setMessages, setAiMessageText, setIsProcessing]);
}, [messages, selectedModelRef, tokenGenerationLimit, cactusContext.context])

// Called when speech recognition encounters an error
const onSpeechError = useCallback((e: any) => {
@@ -186,7 +186,9 @@ export const VoiceModeOverlay = ({
{isListening && <Text textAlign="center">Listening...</Text>}
</YStack>
<YStack position='absolute' bottom='10%' width='80%' gap="$2">
<Text fontSize={12} textAlign='center' color="$gray10">This voice experience is currently powered by Apple Speech and not by Cactus Voice</Text>
<Text fontSize={12} textAlign='center' color="$gray10">
This voice experience is currently powered by Apple Speech and not by Cactus Voice
</Text>
{errorMessage && <Text color="$red10">Error: {errorMessage}</Text>}
</YStack>
</YStack>
6 changes: 5 additions & 1 deletion components/ui/chat/MessageInput.tsx
@@ -85,7 +85,11 @@ function MessageInputComponent({ sendMessage, isStreaming, selectedModel, setVoi

const handlePause = () => {
console.log('pause!')
cactusContext.context?.stopCompletion();
// In v0.1.4, the context object may have different method signatures
// Make sure stopCompletion exists before calling it
if (cactusContext.context?.stopCompletion) {
cactusContext.context.stopCompletion();
}
}

return (
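The guarded call above could also be written with optional chaining on the method itself, which behaves the same but is shorter; a possible alternative, not part of this diff:

// Equivalent guard: optional chaining only invokes stopCompletion if it exists
cactusContext.context?.stopCompletion?.();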
2 changes: 2 additions & 0 deletions contexts/modelContext.tsx
@@ -133,6 +133,8 @@ export const ModelProvider = ({ children }: { children: React.ReactNode }) => {
n_ctx: 2048,
n_gpu_layers: gpuLayers
});
// In v0.1.4, context maintains its own message history
// Context starts fresh when initialized, so no need to explicitly rewind here
const endTime = performance.now();
logModelLoadDiagnostics({model: selectedModel.value, loadTime: endTime - startTime});
setCactusContext({
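The comment above relies on the v0.1.4 context starting empty after initLlama. If the provider itself were to seed the system prompt (rather than leaving that to the chat screen), the wiring could look like the sketch below; the modelPath variable and the voiceMode flag passed here are assumptions, not shown in this diff:

// Sketch only - seed the freshly created context with the system prompt
const context = await initLlama({
  model: modelPath,        // assumed local file path of the selected model
  n_ctx: 2048,
  n_gpu_layers: gpuLayers,
});
await initializeConversationContext(context, /* voiceMode */ false);
setCactusContext({ context });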
2 changes: 1 addition & 1 deletion package.json
@@ -32,7 +32,7 @@
"@tamagui/shorthands": "^1.125.33",
"@tamagui/themes": "^1.125.33",
"@tamagui/web": "^1.125.33",
"cactus-react-native": "^0.0.1",
"cactus-react-native": "^0.1.4",
"expo": "~52.0.43",
"expo-blur": "~14.0.3",
"expo-constants": "~17.0.8",
83 changes: 70 additions & 13 deletions services/chat/llama-local.ts
@@ -12,6 +12,65 @@ export interface ChatCompleteCallback {
(metrics: ModelMetrics, model: Model, completeMessage: string): void;
}

/**
* Initialize the conversation context with system message for v0.1.4
*/
export async function initializeConversationContext(
context: LlamaContext,
voiceMode?: boolean
): Promise<void> {
const systemMessage = {
role: 'system',
content: `You are Cactus, a very capable AI assistant running offline on a smartphone. ${voiceMode ? 'Keep your messages VERY short. One-two sentences max.' : ''}`
};

// Initialize context with system message
await context.completion({
messages: [systemMessage],
n_predict: 1, // Minimal prediction just to set the system message
stop: ['</s>']
});
}

/**
* Load existing conversation into context for v0.1.4
*/
export async function loadConversationIntoContext(
context: LlamaContext,
messages: Message[],
voiceMode?: boolean
): Promise<void> {
if (messages.length === 0) {
// If no messages, just initialize with system message
await initializeConversationContext(context, voiceMode);
return;
}

if (context.stopCompletion) await context.stopCompletion();
// Guard against a hanging rewind(): give it at most 3 seconds before giving up
await Promise.race([
  context.rewind(),
  new Promise((_, reject) => setTimeout(() => reject(new Error('context.rewind() timed out')), 3000))
]);

// Format all messages including system message
const systemMessage = {
role: 'system',
content: `You are Cactus, a very capable AI assistant running offline on a smartphone. ${voiceMode ? 'Keep your messages VERY short. One-two sentences max.' : ''}`
};

const formattedMessages = [
systemMessage,
...messages.map(msg => ({
role: msg.isUser ? 'user' : 'assistant',
content: msg.text
}))
];

// Load all messages in one call to restore conversation context
await context.completion({
messages: formattedMessages,
n_predict: 1, // Minimal prediction just to load the context
stop: ['</s>']
});
}

export async function streamLlamaCompletion(
context: LlamaContext | null,
messages: Message[],
@@ -21,7 +80,8 @@ export async function streamLlamaCompletion(
streaming: boolean = true,
maxTokens: number,
isReasoningEnabled: boolean,
voiceMode?: boolean
voiceMode?: boolean,
isFirstMessage?: boolean
) {
try {
console.log('Ensuring Llama context...', new Date().toISOString());
@@ -34,16 +94,13 @@
'<|im_end|>', '<|EOT|>', '<|END_OF_TURN_TOKEN|>',
'<|end_of_turn|>', '<|endoftext|>', '<end_of_turn>', '<|end_of_sentence|>'];

const formattedMessages = [
{
role: 'system',
content: `You are Cactus, a very capable AI assistant running offline on a smartphone. ${voiceMode ? 'Keep your messages VERY short. One-two sentences max.' : ''}`
},
...messages.map(msg => ({
role: msg.isUser ? 'user' : 'assistant',
content: msg.text
}))
];
// In v0.1.4, the context maintains its own message history
// So we only need to pass the latest message
const latestMessage = messages[messages.length - 1];
const formattedLatestMessage = {
role: latestMessage.isUser ? 'user' : 'assistant',
content: latestMessage.text
};

const startTime = performance.now();
let firstTokenTime: number | null = null;
@@ -58,7 +115,7 @@
if (streaming) {
const result = await context.completion(
{
messages: formattedMessages,
messages: [formattedLatestMessage], // Only pass the latest message
n_predict: maxTokens,
stop: stopWords,
},
@@ -82,7 +139,7 @@
onComplete(modelMetrics, model, responseText);
} else {
const result = await context.completion({
messages: formattedMessages,
messages: [formattedLatestMessage],
n_predict: 1024,
stop: stopWords,
});
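Taken together, the new helpers assume the v0.1.4 context is stateful: history is replayed once via loadConversationIntoContext (or seeded via initializeConversationContext), and every later turn forwards only the newest message. A minimal sketch of that interaction, assuming context.completion accepts the same { messages, n_predict, stop } shape used above and appends each exchange to the context's internal history:

// Sketch only - two turns against a stateful context
await initializeConversationContext(context);           // history: [system]
await context.completion({
  messages: [{ role: 'user', content: 'Hi, who are you?' }],
  n_predict: 256,
  stop: ['</s>'],
});                                                      // history: [system, user, assistant]
await context.completion({
  messages: [{ role: 'user', content: 'Summarize that in one sentence.' }],
  n_predict: 256,
  stop: ['</s>'],
});                                                      // follow-up resolves against the stored history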