Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ── LLM Validation ────────────────────────────────────────────────────────────
# Validates extracted conversation samples for coherence and quality before
# writing the dataset. Enabled by default when ANTHROPIC_API_KEY is set.
# Set to false to skip validation entirely (faster, no API calls).
DIALOGSMITH_LLM_VALIDATE=true

# Model used for validation scoring (defaults to claude-haiku-4-5-20251001).
# A fast, cheap model is recommended here — the validator runs once per sample.
DIALOGSMITH_LLM_MODEL=claude-haiku-4-5-20251001

# Your Anthropic API key. Required when DIALOGSMITH_LLM_VALIDATE=true.
ANTHROPIC_API_KEY=your_api_key_here
33 changes: 22 additions & 11 deletions scripts/convert_to_sharegpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,36 @@
input_path = "./data/chat_dataset.jsonl"
output_path = "./data/chat_sharegpt.json"

ROLE_MAP = {
"user": "human",
"assistant": "gpt",
}

output_data = []

with open(input_path, "r", encoding="utf-8") as infile:
for line in infile:
sample = json.loads(line)
prompt = sample.get("prompt", "").strip()
response = sample.get("response", "").strip()
turns = sample.get("conversations", [])

if not turns:
continue

conversations = []
for turn in turns:
role = ROLE_MAP.get(turn.get("role", ""), turn.get("role", ""))
text = turn.get("text", "").strip()
if role and text:
conversations.append({"from": role, "value": text})

if not prompt or not response:
continue # Skip blank entries
# Must have at least one human and one gpt turn
roles_present = {t["from"] for t in conversations}
if "human" not in roles_present or "gpt" not in roles_present:
continue

output_data.append({
"conversations": [
{"from": "user", "value": prompt},
{"from": "assistant", "value": response}
]
})
output_data.append({"conversations": conversations})

with open(output_path, "w", encoding="utf-8") as outfile:
json.dump(output_data, outfile, indent=2, ensure_ascii=False)

print(f"Converted {len(output_data)} valid samples to ShareGPT format.")
print(f"Converted {len(output_data)} valid conversation samples to ShareGPT format.")
192 changes: 133 additions & 59 deletions scripts/telegram_extract.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,163 @@
import json

# CONFIGURATION
RESULT_PATH = "./data/result.json" # Path to your exported result.json file
RESULT_PATH = "./data/result.json" # Path to your exported result.json file
OUTPUT_PATH = "./data/chat_dataset.jsonl"
TIME_GAP_THRESHOLD = 30 # Seconds between chained messages
MESSAGE_CHAIN_THRESHOLD = 30 # Max seconds between chained messages from same sender
CONVERSATION_GAP_THRESHOLD = 3600 # Seconds of silence that starts a new conversation


def get_user_name(data):
personal_info = data.get("personal_information", {})
first = personal_info.get("first_name", "")
last = personal_info.get("last_name", "")
return f"{first} {last}".strip()


def load_all_messages_from_result(data):
all_messages = []
chat_list = data.get("chats", {}).get("list", [])

for chat in chat_list:
messages = chat.get("messages", [])
if not messages:
if messages:
all_messages.append(messages)
return all_messages # List of message lists (per chat)


def get_text(msg):
"""Extract plain text from a message, handling both string and entity list formats."""
text = msg.get("text", "")
if isinstance(text, str):
return text.strip()
if isinstance(text, list):
return "".join(
t["text"] if isinstance(t, dict) else t
for t in msg.get("text_entities", text)
).strip()
return ""


def is_valid_message(msg):
return msg.get("type") == "message" and bool(get_text(msg))


def collect_turn(messages, start_idx, sender, chain_threshold):
"""
Collect all consecutive messages from `sender` starting at `start_idx`,
chaining them if within `chain_threshold` seconds of the previous message
in the chain.

Returns (texts: list[str], next_idx: int, last_unixtime: int)
"""
texts = []
last_unixtime = None
j = start_idx

while j < len(messages):
msg = messages[j]

if not is_valid_message(msg):
j += 1
continue
all_messages.append(messages)

return all_messages # List of message lists (per chat)
if msg.get("from") != sender:
break

unixtime = int(msg["date_unixtime"])

if last_unixtime is not None and (unixtime - last_unixtime) > chain_threshold:
break

texts.append(get_text(msg))
last_unixtime = unixtime
j += 1

return texts, j, last_unixtime


def split_into_conversations(messages, gap_threshold):
"""
Split a flat message list into sub-lists representing distinct conversations,
based on silence gaps between valid messages.
"""
conversations = []
current = []
last_unixtime = None

for msg in messages:
if not is_valid_message(msg):
continue

unixtime = int(msg["date_unixtime"])

if last_unixtime is not None and (unixtime - last_unixtime) > gap_threshold:
if current:
conversations.append(current)
current = []

current.append(msg)
last_unixtime = unixtime

if current:
conversations.append(current)

return conversations


def format_conversations(message_groups, your_name):
"""
For each chat, split messages into conversations, then walk each conversation
collecting alternating turns into multi-turn samples.

Each sample is a list of {"role": ..., "text": ...} dicts.
Consecutive messages from the same sender are concatenated into one turn.
"""
samples = []

for messages in message_groups:
i = 0
while i < len(messages) - 1:
msg = messages[i]

if msg.get("type") != "message" or not msg.get("text"):
i += 1
continue

sender = msg.get("from")
if sender == your_name:
i += 1
continue # Only process messages from others as prompts

# Format the prompt
if isinstance(msg["text"], str):
prompt = f"{sender}: {msg['text']}"
else:
prompt = f"{sender}: {''.join([t['text'] for t in msg.get('text_entities', [])])}"

# Gather your consecutive responses
response = []
j = i + 1
while j < len(messages):
next_msg = messages[j]
if next_msg.get("type") != "message" or not next_msg.get("text"):
j += 1
continue

if next_msg.get("from") != your_name:
break

time_diff = int(next_msg["date_unixtime"]) - int(messages[j - 1]["date_unixtime"])
if time_diff > TIME_GAP_THRESHOLD:
break

text = next_msg["text"]
if isinstance(text, str):
response.append(text)
elif isinstance(text, list):
response.append(''.join([t["text"] for t in next_msg.get("text_entities", [])]))

j += 1

if response:
samples.append({
"prompt": prompt.strip(),
"response": "\n".join(response).strip()
})

i = j # Move pointer forward
conversations = split_into_conversations(messages, CONVERSATION_GAP_THRESHOLD)

for conversation in conversations:
turns = []
i = 0

while i < len(conversation):
msg = conversation[i]
sender = msg.get("from")
role = "assistant" if sender == your_name else "user"

texts, next_i, _ = collect_turn(
conversation, i, sender, MESSAGE_CHAIN_THRESHOLD
)

if texts:
turn_text = "\n".join(texts)
# Merge with previous turn if same role (edge case: gap exceeded mid-block)
if turns and turns[-1]["role"] == role:
turns[-1]["text"] += "\n" + turn_text
Comment on lines +136 to +137
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The MESSAGE_CHAIN_THRESHOLD (currently 30s) is effectively ignored because consecutive turns with the same role are unconditionally merged here. Since collect_turn already stops when the sender changes, any consecutive calls to it within the same conversation will necessarily have the same role and thus be merged. If the goal is to strictly group all consecutive messages from the same sender into a single turn for ShareGPT compatibility, the threshold check in collect_turn (lines 68-69) is redundant. If you intended to split turns based on time, this merge logic should be removed, though that might produce non-alternating roles which are incompatible with some training formats.

else:
turns.append({"role": role, "text": turn_text})

i = next_i

# Only keep conversations that have at least one user + one assistant turn
roles = [t["role"] for t in turns]
if "user" in roles and "assistant" in roles:
samples.append(turns)

return samples


def save_dataset(samples, out_path):
with open(out_path, "w", encoding="utf-8") as f:
for sample in samples:
json.dump(sample, f, ensure_ascii=False)
json.dump({"conversations": sample}, f, ensure_ascii=False)
f.write("\n")


if __name__ == "__main__":
from validator import validate_samples

print(f"Loading {RESULT_PATH}...")
with open(RESULT_PATH, encoding="utf-8") as f:
data = json.load(f)
Expand All @@ -96,9 +167,12 @@ def save_dataset(samples, out_path):

message_groups = load_all_messages_from_result(data)

print("Formatting prompt-response pairs...")
print("Formatting multi-turn conversations...")
samples = format_conversations(message_groups, your_name)
print(f"Extracted {len(samples)} conversation samples.")

samples = validate_samples(samples)

print(f"Saving {len(samples)} samples to {OUTPUT_PATH}...")
print(f"Saving {len(samples)} conversation samples to {OUTPUT_PATH}...")
save_dataset(samples, OUTPUT_PATH)
print("Done.")
Loading