14 changes: 12 additions & 2 deletions apps/llm/app/voice_chat/index.tsx
@@ -76,7 +76,11 @@ function VoiceChatScreen() {
       });
       recorder.start();
       const transcription = await speechToText.stream();
-      await llm.sendMessage(transcription);
+      await llm.sendMessage(
+        typeof transcription === 'string'
+          ? transcription
+          : transcription.map((w) => w.word).join(' ')
+      );
     }
   };

@@ -105,7 +109,13 @@ function VoiceChatScreen() {
         ...llm.messageHistory,
         {
           role: 'user',
-          content: speechToText.committedTranscription,
+          content:
+            typeof speechToText.committedTranscription ===
+            'string'
+              ? speechToText.committedTranscription
+              : speechToText.committedTranscription
+                  .map((w) => w.word)
+                  .join(' '),
         },
       ]
     : llm.messageHistory
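Note: both hunks above inline the same string | Word[] narrowing. A small shared helper would remove the duplication — a minimal sketch, assuming only that the Word type exported by react-native-executorch (imported in the next file) carries a word field, which both call sites already rely on; the name transcriptionToText is chosen here for illustration:

import { Word } from 'react-native-executorch';

// Flatten a timestamped transcription into plain text; pass strings through.
const transcriptionToText = (t: string | Word[]): string =>
  typeof t === 'string' ? t : t.map((w) => w.word).join(' ');

Both call sites would then reduce to llm.sendMessage(transcriptionToText(transcription)) and content: transcriptionToText(speechToText.committedTranscription).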
71 changes: 60 additions & 11 deletions apps/speech/screens/SpeechToTextScreen.tsx
@@ -8,9 +8,14 @@ import {
   TextInput,
   KeyboardAvoidingView,
   Platform,
+  Switch,
 } from 'react-native';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  Word,
+} from 'react-native-executorch';
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import {
   AudioManager,
@@ -28,7 +33,10 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     model: WHISPER_TINY_EN,
   });
 
-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<string | Word[]>('');
+
+  const [enableTimestamps, setEnableTimestamps] = useState(false);
+
   const [audioURL, setAudioURL] = useState('');
   const [liveTranscribing, setLiveTranscribing] = useState(false);
   const scrollViewRef = useRef<ScrollView>(null);
@@ -50,6 +58,15 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     AudioManager.requestRecordingPermissions();
   }, []);
 
+  const getText = (data: string | Word[] | undefined) => {
+    if (!data) return '';
+    if (typeof data === 'string') return data;
+
+    return data
+      .map((w) => `${w.word} (${w.start.toFixed(2)}s - ${w.end.toFixed(2)}s)\n`)
+      .join('');
+  };
+
   const handleTranscribeFromURL = async () => {
     if (!audioURL.trim()) {
       console.warn('Please provide a valid audio file URL');
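Note: with timestamps enabled, getText renders one word per line. An illustrative input/output pair, assuming Word is (at least) { word, start, end } as the code above uses — the values are made up:

// Hypothetical Word[] input (fields as used by getText above):
const words: Word[] = [
  { word: 'hello', start: 0.0, end: 0.42 },
  { word: 'world', start: 0.5, end: 0.9 },
];
getText(words);
// => "hello (0.00s - 0.42s)\nworld (0.50s - 0.90s)\n"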
@@ -66,7 +83,11 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     try {
       const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
       const audioBuffer = decodedAudioData.getChannelData(0);
-      setTranscription(await model.transcribe(audioBuffer));
+
+      const result = await model.transcribe(audioBuffer, {
+        enableTimestamps: enableTimestamps as any,
+      });
+      setTranscription(result);
     } catch (error) {
       console.error('Error decoding audio data', error);
       console.warn('Note: Supported file formats: mp3, wav, flac');
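Note on the enableTimestamps as any cast above: the PR does not explain it. One plausible reading — an assumption, not the library's confirmed API — is that transcribe() is overloaded on a literal boolean so the return type narrows to Promise<Word[]> or Promise<string>; a runtime boolean then matches neither overload, forcing the cast:

// Hypothetical overload pair, sketched for illustration only:
declare function transcribe(
  waveform: Float32Array,
  options: { enableTimestamps: true }
): Promise<Word[]>;
declare function transcribe(
  waveform: Float32Array,
  options?: { enableTimestamps?: false }
): Promise<string>;

declare const enableTimestamps: boolean; // runtime value, not a literal
declare const audioBuffer: Float32Array;

// A plain boolean satisfies neither literal type, so the call only compiles
// with a cast — which would explain the `as any` in the diff:
const result = transcribe(audioBuffer, {
  enableTimestamps: enableTimestamps as any,
});

If that is the real signature, a typed alternative would be two explicit branches calling with literal true or false instead of the cast.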
@@ -76,14 +97,15 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
 
   const handleStartTranscribeFromMicrophone = async () => {
     setLiveTranscribing(true);
-    setTranscription('');
+    setTranscription(enableTimestamps ? [] : '');
+
     recorder.onAudioReady(({ buffer }) => {
       model.streamInsert(buffer.getChannelData(0));
     });
     recorder.start();
 
     try {
-      await model.stream();
+      await model.stream({ enableTimestamps: enableTimestamps });
     } catch (error) {
       console.error('Error during live transcription:', error);
     }
@@ -106,6 +128,13 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const readyToTranscribe = !model.isGenerating && model.isReady;
   const recordingButtonDisabled = isSimulator || !readyToTranscribe;
 
+  const hasResult = transcription.length > 0;
+
+  const displayedText = hasResult
+    ? getText(transcription)
+    : getText(model.committedTranscription) +
+      getText(model.nonCommittedTranscription);
+
   return (
     <SafeAreaProvider>
       <SafeAreaView style={styles.container}>
@@ -126,6 +155,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
           <Text>Status: {getModelStatus()}</Text>
         </View>
 
+        <View style={styles.toggleContainer}>
+          <Text style={styles.toggleLabel}>Enable Timestamps</Text>
+          <Switch
+            value={enableTimestamps}
+            onValueChange={(val) => {
+              setEnableTimestamps(val);
+              setTranscription(val ? [] : '');
+            }}
+            trackColor={{ false: '#767577', true: '#0f186e' }}
+            thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'}
+            disabled={model.isGenerating}
+          />
+        </View>
+
         <View style={styles.transcriptionContainer}>
           <Text style={styles.transcriptionLabel}>Transcription</Text>
           <ScrollView
@@ -135,12 +178,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
               scrollViewRef.current?.scrollToEnd({ animated: true })
             }
           >
-            <Text>
-              {transcription !== ''
-                ? transcription
-                : model.committedTranscription +
-                  model.nonCommittedTranscription}
-            </Text>
+            <Text>{displayedText}</Text>
           </ScrollView>
         </View>
 
@@ -229,6 +267,17 @@ const styles = StyleSheet.create({
     marginTop: 12,
     alignItems: 'center',
   },
+  toggleContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginTop: 10,
+    marginBottom: 5,
+  },
+  toggleLabel: {
+    fontSize: 16,
+    marginRight: 10,
+    color: '#0f186e',
+  },
   transcriptionContainer: {
     flex: 1,
     width: '100%',