Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions src/timeline-v1.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ export interface SearchResultRaw {
result?: SearchResultRaw;
};
legacy?: LegacyTweetRaw;
article?: ArticleRaw;
}

export interface TimelineResultRaw {
Expand Down Expand Up @@ -119,6 +120,79 @@ export interface TimelineResultRaw {
};
legacy?: LegacyTweetRaw;
tweet?: TimelineResultRaw;
article?: ArticleRaw;
}

/**
 * Raw envelope for the optional `article` field carried by tweet results.
 * The parser reaches the payload through `article_results.result`.
 */
export interface ArticleRaw {
article_results: {
/** The article payload itself. */
result: ArticleResultRaw;
};
}

/**
 * Raw article payload attached to a tweet result.
 */
export interface ArticleResultRaw {
/** Article identifier; exposed as `Article.id` by the parser. */
rest_id: string;
/** Article title; rendered as the leading H1 of the markdown output. */
title: string;
/** Reference to the cover image; resolved against `media_entities` by `media_key`. */
cover_media?: ArticleCoverMediaRaw;
/** Block/entity structure from which the markdown body is rendered. */
content_state: ArticleContentStateRaw;
/** Media referenced by the article (cover image and media embedded in `atomic` blocks). */
media_entities?: ArticleMediaEntityRaw[];
}

/**
 * Raw reference to an article's cover media.
 */
export interface ArticleCoverMediaRaw {
/** Key matched against `ArticleMediaEntityRaw.media_key` to locate the cover entry. */
media_key: string;
media_info: {
/**
 * Image URL as reported on the cover reference.
 * NOTE(review): the parsing code takes the cover URL from the matching
 * `media_entities` entry rather than from this field - confirm that is intended.
 */
original_img_url: string;
};
}

/**
 * Raw rich-text content of an article: an ordered list of text blocks plus
 * the entities (links, media) that the blocks refer to by key.
 */
export interface ArticleContentStateRaw {
/** Blocks in document order; the markdown renderer walks them sequentially. */
blocks: ArticleBlockRaw[];
/** Entity table; entries are looked up by `key` from a block's `entityRanges`. */
entityMap: ArticleEntityRaw[];
}

/**
 * A single block (paragraph, heading, list item, media placeholder, ...) of
 * an article's content state.
 */
export interface ArticleBlockRaw {
/** Block identifier as delivered by the API. */
key: string;
/** Plain text of the block, before any styling or links are applied. */
text: string;
/** Block kind, e.g. `header-one`, `header-two`, `unordered-list-item`, `atomic`, `unstyled`. */
type: string;
/**
 * Inline formatting applied to sub-ranges of `text`.
 * The renderer compares `style` case-insensitively against `bold` and `italic`.
 * NOTE(review): the renderer indexes `offset`/`length` over `Array.from(text)`,
 * i.e. it treats them as code-point positions - confirm against the API.
 */
inlineStyleRanges: {
offset: number;
length: number;
style: string;
}[];
/**
 * Sub-ranges of `text` that are bound to an entity; `key` refers to
 * `ArticleEntityRaw.key` in the content state's `entityMap`.
 */
entityRanges: {
key: number;
offset: number;
length: number;
}[];
}

/**
 * One media item listed inside a `MEDIA` entity.
 */
export interface ArticleEntityValueMediaItemRaw {
localMediaId: string;
mediaCategory: string;
/** Matched against `ArticleMediaEntityRaw.media_id` to resolve the media URL. */
mediaId: string;
}

/**
 * Payload of an entity referenced from a block's `entityRanges`.
 */
export interface ArticleEntityValueRaw {
/** Entity kind; the renderer handles `LINK` (uses `data.url`) and `MEDIA` (uses `data.mediaItems`). */
type: string;
mutability?: string;
data: {
/** Link target; read for `LINK` entities. */
url?: string;
entityKey?: string;
/** Embedded media; read for `MEDIA` entities. */
mediaItems?: ArticleEntityValueMediaItemRaw[];
};
}

/**
 * Entry of the content state's entity table.
 */
export interface ArticleEntityRaw {
/** Identifier referenced by `entityRanges[].key`; the renderer compares both sides as strings. */
key: number;
/** The entity's type and data. */
value: ArticleEntityValueRaw;
}

/**
 * A media object attached to an article.
 */
export interface ArticleMediaEntityRaw {
/** Key used to match the article's `cover_media.media_key`. */
media_key: string;
/** Identifier used to match `ArticleEntityValueMediaItemRaw.mediaId`; exposed as the cover photo id. */
media_id: string;
media_info: {
/** Media kind discriminator as delivered by the API. */
__typename: 'ApiImage' | 'ApiGif' | 'ApiVideo';
/** URL emitted for embedded images and for the cover photo. */
original_img_url: string;
};
}

export interface LegacyTweetRaw {
Expand Down
146 changes: 145 additions & 1 deletion src/timeline-v2.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import { CoreUserRaw, LegacyUserRaw } from './profile';
import { parseMediaGroups, reconstructTweetHtml } from './timeline-tweet-util';
import {
ArticleEntityValueMediaItemRaw,
ArticleResultRaw,
EditControlInitialRaw,
LegacyTweetRaw,
ParseTweetResult,
QueryTweetsResponse,
SearchResultRaw,
TimelineResultRaw,
} from './timeline-v1';
import { Tweet } from './tweets';
import { Article, Tweet } from './tweets';
import { isFieldDefined } from './type-util';

export interface TimelineUserResultRaw {
Expand Down Expand Up @@ -257,6 +259,139 @@ export function parseLegacyTweet(
return { success: true, tweet: tw };
}

/**
 * A pair of markdown markers to place around the code points `[start, end)`
 * of a block's original (unmodified) text.
 */
interface ArticleMarkdownWrap {
  start: number;
  end: number;
  open: string;
  close: string;
}

/**
 * Inserts every wrap's markers into `text` in a single pass.
 *
 * All offsets refer to the ORIGINAL text, so inserting one marker can never
 * shift the position of another (which is what happens when links and styles
 * are spliced in one after the other).
 *
 * - Offsets are code-point indices (the text is split with `Array.from`).
 * - Out-of-range offsets are clamped; empty ranges are dropped.
 * - A single trailing line break inside a range is left outside the closing
 *   marker, so `**bold\n**` never appears.
 * - Ranges sharing a start open outermost-first and close innermost-first so
 *   identical or nested ranges produce properly nested markers.
 */
function applyArticleMarkdownWraps(
  text: string,
  wraps: readonly ArticleMarkdownWrap[],
): string {
  const chars = Array.from(text);
  const size = chars.length;

  const normalized: ArticleMarkdownWrap[] = [];
  for (const wrap of wraps) {
    const start = Math.min(Math.max(wrap.start, 0), size);
    let end = Math.min(Math.max(wrap.end, start), size);
    if (end > start && chars[end - 1] === '\n') {
      end -= 1;
    }
    // `end > start` is also false for NaN offsets, which drops malformed ranges.
    if (end > start) {
      normalized.push({ start, end, open: wrap.open, close: wrap.close });
    }
  }

  // Stable sort: earlier start first; for equal starts the longer range is the outer one.
  const ordered = [...normalized].sort(
    (a, b) => a.start - b.start || b.end - a.end,
  );

  const opens = new Array<string>(size + 1).fill('');
  const closes = new Array<string>(size + 1).fill('');
  for (const wrap of ordered) {
    opens[wrap.start] = (opens[wrap.start] ?? '') + wrap.open;
  }
  // Walk backwards so that, at a shared end position, inner ranges close first.
  for (const wrap of [...ordered].reverse()) {
    closes[wrap.end] = (closes[wrap.end] ?? '') + wrap.close;
  }

  let out = '';
  for (let i = 0; i <= size; i++) {
    out += (closes[i] ?? '') + (opens[i] ?? '') + (chars[i] ?? '');
  }
  return out;
}

/**
 * Renders an article's content state as markdown.
 *
 * The title becomes a leading `# ` heading, followed by one markdown element
 * per block: headings, list items, images for `atomic` media blocks, and
 * plain paragraphs for everything else. `LINK` entities become
 * `[text](url)`; `bold` / `italic` inline styles (case-insensitive) become
 * `**text**` / `*text*`.
 *
 * @param article - Raw article payload.
 * @returns The markdown document with surrounding whitespace trimmed.
 */
function parseArticleToMarkdown(article: Readonly<ArticleResultRaw>): string {
  const { blocks, entityMap } = article.content_state;

  // Keys are compared as strings because either side may arrive as string or number.
  const findEntity = (key: number) =>
    entityMap.find((e) => String(e.key) === String(key))?.value;

  // Real line breaks: inside a template literal `\\n` would emit a backslash and an `n`.
  let markdown = `# ${article.title}\n\n`;
  let inList = false;
  let orderedIndex = 0;

  for (const block of blocks) {
    const isOrderedItem = block.type === 'ordered-list-item';
    const isListItem = isOrderedItem || block.type === 'unordered-list-item';

    // Terminate a list with a blank line; otherwise the next paragraph is
    // parsed as a lazy continuation of the last list item.
    if (inList && !isListItem) {
      markdown += '\n';
    }
    inList = isListItem;
    orderedIndex = isOrderedItem ? orderedIndex + 1 : 0;

    if (block.type === 'atomic') {
      // Atomic blocks carry no text of their own; emit their media as images.
      for (const range of block.entityRanges) {
        const entity = findEntity(range.key);
        if (entity?.type !== 'MEDIA' || !entity.data.mediaItems) continue;
        for (const mediaItem of entity.data.mediaItems) {
          if (!mediaItem?.mediaId) continue;
          const mediaEntity = article.media_entities?.find(
            (m) => m.media_id === mediaItem.mediaId,
          );
          if (mediaEntity) {
            markdown += `![image](${mediaEntity.media_info.original_img_url})\n\n`;
          }
        }
      }
      continue;
    }

    // Collect links and styles against the original offsets, then apply them together.
    const wraps: ArticleMarkdownWrap[] = [];
    for (const range of block.entityRanges) {
      const entity = findEntity(range.key);
      if (entity?.type === 'LINK' && entity.data.url) {
        wraps.push({
          start: range.offset,
          end: range.offset + range.length,
          open: '[',
          close: `](${entity.data.url})`,
        });
      }
    }
    for (const range of block.inlineStyleRanges) {
      const style = range.style.toLowerCase();
      const marker =
        style === 'bold' ? '**' : style === 'italic' ? '*' : undefined;
      if (marker) {
        wraps.push({
          start: range.offset,
          end: range.offset + range.length,
          open: marker,
          close: marker,
        });
      }
    }
    const text = applyArticleMarkdownWraps(block.text, wraps);

    switch (block.type) {
      case 'header-one':
        markdown += `# ${text}\n\n`;
        break;
      case 'header-two':
        markdown += `## ${text}\n\n`;
        break;
      case 'header-three':
        markdown += `### ${text}\n\n`;
        break;
      case 'unordered-list-item':
        markdown += `* ${text}\n`;
        break;
      case 'ordered-list-item':
        markdown += `${orderedIndex}. ${text}\n`;
        break;
      case 'unstyled':
      default:
        markdown += `${text}\n\n`;
        break;
    }
  }

  return markdown.trim();
}

/**
 * Builds the public `Article` view of a raw article payload.
 *
 * The id, title and content state are copied over as-is. A cover photo is
 * attached only when the payload names a cover and a media entity with the
 * same `media_key` exists.
 *
 * @param articleRaw - Raw article payload.
 * @returns The parsed article.
 */
function parseArticle(articleRaw: Readonly<ArticleResultRaw>): Article {
  const {
    cover_media: coverRef,
    media_entities: mediaEntities,
  } = articleRaw;

  const parsed: Article = {
    id: articleRaw.rest_id,
    title: articleRaw.title,
    content_state: articleRaw.content_state,
  };

  if (!coverRef) {
    return parsed;
  }

  const match = mediaEntities?.find(
    (candidate) => candidate.media_key === coverRef.media_key,
  );
  if (match) {
    parsed.cover = {
      id: match.media_id,
      url: match.media_info.original_img_url,
      alt_text: undefined, // not available
    };
  }

  return parsed;
}

function parseResult(result?: TimelineResultRaw): ParseTweetResult {
const noteTweetResultText =
result?.note_tweet?.note_tweet_results?.result?.text;
Expand All @@ -282,6 +417,15 @@ function parseResult(result?: TimelineResultRaw): ParseTweetResult {
}
}

const articleRaw = result?.article?.article_results?.result;
if (articleRaw) {
tweetResult.tweet.isArticle = true;
if (articleRaw.content_state) {
tweetResult.tweet.article = parseArticle(articleRaw);
tweetResult.tweet.text = parseArticleToMarkdown(articleRaw);
}
}

const quotedResult = result?.quoted_status_result?.result;
if (quotedResult) {
if (quotedResult.legacy && quotedResult.rest_id) {
Expand Down
30 changes: 30 additions & 0 deletions src/tweets.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -378,3 +378,33 @@ test('scraper can get animated image as video', async () => {
url: expectedURL,
});
});

test('scraper marks article tweets and exposes article metadata', async () => {
  // For an X Article tweet, `legacy.full_text` only holds the t.co URL stub;
  // the actual body is delivered under `article.article_results.result`.
  // Before article parsing existed, `text` came back as the bare URL and the
  // body was lost.
  const scraper = await getScraper();
  const articleTweet = await scraper.getTweet('2053808119709659225');

  expect(articleTweet).not.toBeNull();
  expect(articleTweet?.isArticle).toBe(true);

  const article = articleTweet?.article;
  expect(article).toBeDefined();
  expect(article?.id).toBe('2051886859186532352');
  expect(article?.title).toContain('Research Layer');

  // The markdown is rendered from content_state, so it must carry real content.
  const blockCount = article?.content_state?.blocks?.length ?? 0;
  expect(blockCount).toBeGreaterThan(10);
});

test('scraper renders article body into tweet.text as markdown', async () => {
  const scraper = await getScraper();
  const articleTweet = await scraper.getTweet('2053808119709659225');
  const renderedText = articleTweet?.text;

  expect(renderedText).toBeDefined();
  // The rendered body is much longer than the bare-URL stub returned
  // previously (~23 chars), and it starts with the title as an H1.
  expect((renderedText ?? '').length).toBeGreaterThan(1000);
  expect(renderedText).toMatch(/^# /);
});
15 changes: 14 additions & 1 deletion src/tweets.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { addApiFeatures, requestApi, bearerToken2 } from './api';
import { TwitterAuth } from './auth';
import { getUserIdByScreenName } from './profile';
import { LegacyTweetRaw, QueryTweetsResponse } from './timeline-v1';
import {
ArticleContentStateRaw,
LegacyTweetRaw,
QueryTweetsResponse,
} from './timeline-v1';
import {
parseTimelineTweetsV2,
TimelineV2,
Expand Down Expand Up @@ -33,6 +37,13 @@ export interface Video {
url?: string;
}

/**
 * Parsed article attached to a tweet.
 */
export interface Article {
/** Article identifier (the raw payload's `rest_id`). */
id: string;
/** Article title. */
title: string;
/** Cover photo; present only when the raw cover reference matches one of the article's media entities. */
cover?: Photo;
/** Raw block/entity structure, passed through unchanged from the API payload. */
content_state: ArticleContentStateRaw;
}

export interface PlaceRaw {
id?: string;
place_type?: string;
Expand Down Expand Up @@ -65,6 +76,8 @@ export interface Tweet {
isReply?: boolean;
isRetweet?: boolean;
isSelfThread?: boolean;
isArticle?: boolean;
article?: Article;
likes?: number;
name?: string;
mentions: Mention[];
Expand Down