Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions src/__tests__/azure-mstts-namespace.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -149,5 +149,40 @@ describe("Azure MSTTS Namespace Handling", () => {
expect(result).toMatch(/<voice[^>]*>\s*<prosody[^>]*>/);
expect(result).toMatch(/<\/prosody>\s*<\/voice>/);
});

it("should produce a single <prosody> element (not double-nested) when options are provided", async () => {
  // Regression guard for https://github.com/willwade/js-tts-wrapper/issues/40:
  // the this.properties defaults (rate="medium", pitch="medium", volume=100) were
  // always truthy, so one <prosody> was emitted for them and a second one was then
  // stacked on top for the per-call options.
  const speakOptions = { rate: "fast", pitch: "high", volume: 80 };
  const inputSSML = `<speak>Hello world</speak>`;

  const output = (client as any).ensureAzureSSMLStructure(
    inputSSML,
    "en-US-JennyNeural",
    speakOptions
  );

  // Count the opening <prosody tags: exactly one wrapper is expected.
  const openingTags = output.match(/<prosody/g);
  expect(openingTags?.length).toBe(1);
});

it("should not emit <prosody> when all values are at Azure defaults", async () => {
  // Every option below equals the implicit default, so a <prosody> wrapper
  // would be a no-op and must be left out of the generated SSML.
  const defaultOptions = { rate: "medium", pitch: "medium", volume: 100 };
  const inputSSML = `<speak>Hello world</speak>`;

  const output = (client as any).ensureAzureSSMLStructure(
    inputSSML,
    "en-US-JennyNeural",
    defaultOptions
  );

  expect(output).not.toContain("<prosody");
});

it("should normalise 0-1 volume fraction to 0-100 percentage", async () => {
  // Regression guard for https://github.com/willwade/js-tts-wrapper/issues/40:
  // callers often supply volume as a 0-1 float, so 0.8 has to be rendered as
  // volume="80%" rather than the literal volume="0.8%".
  const fractionalVolume = { volume: 0.8 };
  const inputSSML = `<speak>Hello world</speak>`;

  const output = (client as any).ensureAzureSSMLStructure(
    inputSSML,
    "en-US-JennyNeural",
    fractionalVolume
  );

  // The scaled percentage must be present and the unscaled one absent.
  expect(output).toContain('volume="80%"');
  expect(output).not.toContain('volume="0.8%"');
});
});
});
64 changes: 34 additions & 30 deletions src/engines/azure.ts
Original file line number Diff line number Diff line change
Expand Up @@ -616,38 +616,42 @@ export class AzureTTSClient extends AbstractTTSClient {
}
}

// Add prosody if properties are set
if (this.properties.rate || this.properties.pitch || this.properties.volume) {
// Extract content between voice tags or speak tags
let content = "";
if (ssml.includes("<voice")) {
const match = ssml.match(/<voice[^>]*>(.*?)<\/voice>/s);
if (match) {
content = match[1];
const prosodyContent = this.constructProsodyTag(content);
ssml = ssml.replace(content, prosodyContent);
}
} else {
const match = ssml.match(/<speak[^>]*>(.*?)<\/speak>/s);
if (match) {
content = match[1];
const prosodyContent = this.constructProsodyTag(content);
ssml = ssml.replace(content, prosodyContent);
}
// Build prosody attributes by merging this.properties defaults with per-call options.
// Options take precedence. We only emit a <prosody> element when at least one
// attribute differs from Azure's implicit defaults (medium/medium/100%), to avoid
// wrapping content in a no-op element.
{
const DEFAULT_RATE = "medium";
const DEFAULT_PITCH = "medium";
const DEFAULT_VOLUME = 100;

const rate = options?.rate ?? (this.properties.rate as string | undefined);
const pitch = options?.pitch ?? (this.properties.pitch as string | undefined);
// volume: SpeakOptions types volume as 0-100. Guard against callers who pass a
// 0-1 fraction by normalising: any value ≤ 1 (and > 0) is treated as a fraction
// and scaled to 0-100.
let rawVolume: number | undefined =
options?.volume !== undefined
? options.volume
: (this.properties.volume as number | undefined);
if (rawVolume !== undefined && rawVolume > 0 && rawVolume <= 1) {
rawVolume = Math.round(rawVolume * 100);
}
}
const volume = rawVolume !== undefined ? rawVolume : DEFAULT_VOLUME;

const hasNonDefaultProsody =
(rate !== undefined && rate !== DEFAULT_RATE) ||
(pitch !== undefined && pitch !== DEFAULT_PITCH) ||
volume !== DEFAULT_VOLUME;

if (hasNonDefaultProsody) {
const attrs: string[] = [];
if (rate && rate !== DEFAULT_RATE) attrs.push(`rate="${rate}"`);
if (pitch && pitch !== DEFAULT_PITCH) attrs.push(`pitch="${pitch}"`);
if (volume !== DEFAULT_VOLUME) attrs.push(`volume="${volume}%"`);

// Also add prosody from options if provided
if (options?.rate || options?.pitch || options?.volume !== undefined) {
// Create prosody attributes
const attrs: string[] = [];
if (options.rate) attrs.push(`rate="${options.rate}"`);
if (options.pitch) attrs.push(`pitch="${options.pitch}"`);
if (options.volume !== undefined) attrs.push(`volume="${options.volume}%"`);

if (attrs.length > 0) {
// Extract content from inside <voice> if present, otherwise from <speak>.
// Prosody must be nested inside <voice>, not as a direct child of <speak>.
// <prosody> must be nested inside <voice>, not as a direct child of <speak>.
// Azure rejects: Node [speak] should not contain node [prosody] with type [Others].
if (ssml.includes("<voice")) {
const match = ssml.match(/<voice[^>]*>(.*?)<\/voice>/s);
if (match) {
Expand Down
Loading