Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Linkify
=======

Linkify is a Rust library to find links such as URLs and email addresses in
Linkify is a Rust library to find links such as URLs, email addresses, and bug references in
plain text. It's smart about where a link ends, such as with trailing
punctuation.

Expand Down Expand Up @@ -29,7 +29,7 @@ Seems simple enough. But then we also have these cases:
This library behaves as you'd expect in the above cases and many more.
It uses a simple scan with linear runtime.

In addition to URLs, it can also find email addresses.
In addition to URLs, it can also find email addresses and bug references like `#12345`.

## Demo 🧑‍🔬

Expand Down Expand Up @@ -62,6 +62,7 @@ assert_eq!(1, links.len());
let link = &links[0];

assert_eq!("http://example.com", link.as_str());
assert_eq!("http://example.com", link.href());
assert_eq!(14, link.start());
assert_eq!(32, link.end());
assert_eq!(&LinkKind::Url, link.kind());
Expand Down Expand Up @@ -98,6 +99,24 @@ assert_eq!("foo@example.com", link.as_str());
assert_eq!(&LinkKind::Email, link.kind());
```

Configure a bug reference prefix:

```rust
use linkify::{LinkFinder, LinkKind};

let input = "Fixed in #12345";
let mut finder = LinkFinder::new();
finder.kinds(&[LinkKind::BugReference]);
finder.bug_reference_prefix("https://example.org/bugs/");
let links: Vec<_> = finder.links(input).collect();

assert_eq!(1, links.len());
let link = &links[0];
assert_eq!("#12345", link.as_str());
assert_eq!("https://example.org/bugs/12345", link.href());
assert_eq!(&LinkKind::BugReference, link.kind());
```

See full documentation on [docs.rs](https://docs.rs/linkify).

## Conformance
Expand Down
51 changes: 51 additions & 0 deletions src/bug.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use std::ops::Range;

use crate::scanner::Scanner;

/// Scan for bug references such as `#12345`.
///
/// This is a stateless unit struct. Its `Scanner` implementation accepts a `#`
/// followed by one or more ASCII digits, provided the character before the `#`
/// is neither an identifier character (alphanumeric or `_`) nor another `#`,
/// and the character after the digits is not an identifier character.
pub struct BugReferenceScanner;

impl Scanner for BugReferenceScanner {
    /// Try to match a bug reference whose `#` is at byte offset `hash` of `s`.
    ///
    /// Returns the byte range spanning the `#` and the run of ASCII digits that
    /// follows it. Returns `None` when the text before the `#` is rejected by
    /// `find_start`, when no digit follows the `#`, or when the text after the
    /// digits is rejected by `find_end`.
    fn scan(&self, s: &str, hash: usize) -> Option<Range<usize>> {
        let preceding = &s[..hash];
        if !self.find_start(preceding) {
            return None;
        }

        // Everything after the one-byte `#` trigger.
        let tail = &s[hash + 1..];

        // Length of the leading digit run; the whole tail when it is all digits.
        let digit_count = tail
            .bytes()
            .position(|b| !b.is_ascii_digit())
            .unwrap_or(tail.len());
        if digit_count == 0 {
            return None;
        }

        // Digits are single-byte, so `digit_count` is always a char boundary.
        let following = &tail[digit_count..];
        if self.find_end(following) {
            Some(hash..hash + 1 + digit_count)
        } else {
            None
        }
    }
}

impl BugReferenceScanner {
    /// Check whether a bug reference may begin right after `s`.
    ///
    /// `s` is the text preceding the `#`. A reference may start at the very
    /// beginning of the text, or after any character that is neither an
    /// identifier character nor another `#`.
    fn find_start(&self, s: &str) -> bool {
        s.chars()
            .next_back()
            .map_or(true, |prev| prev != '#' && !Self::identifier_char(prev))
    }

    /// Check whether a bug reference may end right before `s`.
    ///
    /// `s` is the text following the digits. A reference may end at the end of
    /// the text, or before any character that is not an identifier character.
    fn find_end(&self, s: &str) -> bool {
        s.chars()
            .next()
            .map_or(true, |next| !Self::identifier_char(next))
    }

    /// Whether `c` counts as part of an identifier: `_` or any alphanumeric
    /// character.
    fn identifier_char(c: char) -> bool {
        c == '_' || c.is_alphanumeric()
    }
}
75 changes: 64 additions & 11 deletions src/finder.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use std::fmt;
use std::iter::Peekable;

use memchr::{memchr, memchr2, memchr3};

use crate::bug::BugReferenceScanner;
use crate::email::EmailScanner;
use crate::scanner::Scanner;
use crate::url::{DomainScanner, UrlScanner};
Expand All @@ -13,6 +12,7 @@ pub struct Link<'t> {
text: &'t str,
start: usize,
end: usize,
href: Option<String>,
kind: LinkKind,
}

Expand All @@ -35,6 +35,16 @@ impl<'t> Link<'t> {
&self.text[self.start..self.end]
}

/// Get the link destination.
///
/// For most links this is the same as `as_str()`. Bug references return the
/// configured prefix plus the bug number when a bug reference prefix was set
/// on the `LinkFinder`.
#[inline]
pub fn href(&self) -> &str {
self.href.as_deref().unwrap_or_else(|| self.as_str())
}

/// The type of the link.
#[inline]
pub fn kind(&self) -> &LinkKind {
Expand All @@ -50,6 +60,8 @@ pub enum LinkKind {
Url,
/// E-mail links like "foo@example.org"
Email,
/// Bug references like "#12345".
BugReference,
}

/// Span within the input text.
Expand Down Expand Up @@ -95,6 +107,8 @@ impl<'t> Span<'t> {
/// A configured link finder.
#[derive(Debug)]
pub struct LinkFinder {
bug_reference: bool,
bug_reference_prefix: Option<String>,
email: bool,
email_domain_must_have_dot: bool,
url: bool,
Expand All @@ -108,8 +122,10 @@ type TriggerFinder = dyn Fn(&[u8]) -> Option<usize>;
pub struct Links<'t> {
text: &'t str,
rewind: usize,
bug_reference_prefix: Option<String>,

trigger_finder: Box<TriggerFinder>,
bug_reference_scanner: BugReferenceScanner,
email_scanner: EmailScanner,
url_scanner: UrlScanner,
domain_scanner: DomainScanner,
Expand All @@ -129,6 +145,8 @@ impl LinkFinder {
/// If you only want to find a certain kind of links, use the `kinds` method.
pub fn new() -> LinkFinder {
LinkFinder {
bug_reference: true,
bug_reference_prefix: None,
email: true,
email_domain_must_have_dot: true,
url: true,
Expand Down Expand Up @@ -163,12 +181,24 @@ impl LinkFinder {
self
}

/// Configure the prefix that turns bug references into link destinations.
///
/// The matched text is left untouched: with a prefix of
/// `https://example.org/bugs/`, a bug reference `#12345` still yields `#12345`
/// from `as_str()`, while `href()` yields `https://example.org/bugs/12345`.
/// Calling this again replaces the previously stored prefix.
pub fn bug_reference_prefix(&mut self, prefix: &str) -> &mut LinkFinder {
    let owned_prefix = String::from(prefix);
    self.bug_reference_prefix = Some(owned_prefix);
    self
}

/// Restrict the kinds of links that should be found to the specified ones.
pub fn kinds(&mut self, kinds: &[LinkKind]) -> &mut LinkFinder {
self.bug_reference = false;
self.email = false;
self.url = false;
for kind in kinds {
match *kind {
LinkKind::BugReference => self.bug_reference = true,
LinkKind::Email => self.email = true,
LinkKind::Url => self.url = true,
}
Expand All @@ -182,6 +212,8 @@ impl LinkFinder {
pub fn links<'t>(&self, text: &'t str) -> Links<'t> {
Links::new(
text,
self.bug_reference,
self.bug_reference_prefix.clone(),
self.url,
self.url_must_have_scheme,
self.email,
Expand Down Expand Up @@ -217,12 +249,15 @@ impl Default for LinkFinder {
impl<'t> Links<'t> {
fn new(
text: &'t str,
bug_reference: bool,
bug_reference_prefix: Option<String>,
url: bool,
url_must_have_scheme: bool,
email: bool,
email_domain_must_have_dot: bool,
iri_parsing_enabled: bool,
) -> Links<'t> {
let bug_reference_scanner = BugReferenceScanner;
let url_scanner = UrlScanner {
iri_parsing_enabled,
};
Expand All @@ -233,19 +268,28 @@ impl<'t> Links<'t> {
domain_must_have_dot: email_domain_must_have_dot,
};

// With optional schemes URLs don't have unique `:`, then search for `.` as well
let trigger_finder: Box<TriggerFinder> = match (url, email) {
(true, true) if url_must_have_scheme => Box::new(|s| memchr2(b':', b'@', s)),
(true, true) => Box::new(|s| memchr3(b':', b'@', b'.', s)),
(true, false) if url_must_have_scheme => Box::new(|s| memchr(b':', s)),
(true, false) => Box::new(|s| memchr2(b':', b'.', s)),
(false, true) => Box::new(|s| memchr(b'@', s)),
(false, false) => Box::new(|_| None),
};
// With optional schemes URLs don't have unique `:`, then search for `.` as well.
let mut triggers = Vec::new();
if url {
triggers.push(b':');
if !url_must_have_scheme {
triggers.push(b'.');
}
}
if email {
triggers.push(b'@');
}
if bug_reference {
triggers.push(b'#');
}
let trigger_finder: Box<TriggerFinder> =
Box::new(move |s| s.iter().position(|byte| triggers.contains(byte)));
Links {
text,
rewind: 0,
bug_reference_prefix,
trigger_finder,
bug_reference_scanner,
email_scanner,
url_scanner,
domain_scanner,
Expand All @@ -263,6 +307,7 @@ impl<'t> Iterator for Links<'t> {
while let Some(i) = (self.trigger_finder)(slice[find_from..].as_bytes()) {
let trigger = slice.as_bytes()[find_from + i];
let (scanner, kind): (&dyn Scanner, LinkKind) = match trigger {
b'#' => (&self.bug_reference_scanner, LinkKind::BugReference),
b':' => (&self.url_scanner, LinkKind::Url),
b'.' => (&self.domain_scanner, LinkKind::Url),
b'@' => (&self.email_scanner, LinkKind::Email),
Expand All @@ -272,10 +317,18 @@ impl<'t> Iterator for Links<'t> {
let start = self.rewind + range.start;
let end = self.rewind + range.end;
self.rewind = end;
let href = if kind == LinkKind::BugReference {
self.bug_reference_prefix
.as_ref()
.map(|prefix| format!("{}{}", prefix, &self.text[start + 1..end]))
} else {
None
};
let link = Link {
text: self.text,
start,
end,
href,
kind,
};
return Some(link);
Expand Down
24 changes: 22 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! Linkify finds links such as URLs and email addresses in plain text.
//! Linkify finds links such as URLs, email addresses, and bug references in plain text.
//! It's smart about where a link ends, such as with trailing punctuation.
//!
//! Your reaction might be: "Do I need a library for this? Why not a regex?".
Expand All @@ -17,7 +17,7 @@
//! This library behaves as you'd expect in the above cases and many more.
//! It uses a simple scan with linear runtime.
//!
//! In addition to URLs, it can also find emails.
//! In addition to URLs, it can also find emails and bug references like `#12345`.
//!
//! ### Usage
//!
Expand All @@ -34,6 +34,7 @@
//! let link = &links[0];
//!
//! assert_eq!("http://example.com", link.as_str());
//! assert_eq!("http://example.com", link.href());
//! assert_eq!(14, link.start());
//! assert_eq!(32, link.end());
//! assert_eq!(&LinkKind::Url, link.kind());
Expand Down Expand Up @@ -70,6 +71,24 @@
//! assert_eq!(&LinkKind::Email, link.kind());
//! ```
//!
//! Configure a bug reference prefix:
//!
//! ```
//! use linkify::{LinkFinder, LinkKind};
//!
//! let input = "Fixed in #12345";
//! let mut finder = LinkFinder::new();
//! finder.kinds(&[LinkKind::BugReference]);
//! finder.bug_reference_prefix("https://example.org/bugs/");
//! let links: Vec<_> = finder.links(input).collect();
//!
//! assert_eq!(1, links.len());
//! let link = &links[0];
//! assert_eq!("#12345", link.as_str());
//! assert_eq!("https://example.org/bugs/12345", link.href());
//! assert_eq!(&LinkKind::BugReference, link.kind());
//! ```
//!
//! Split the text into consecutive spans (mixed links and plain text).
//!
//! ```
Expand Down Expand Up @@ -119,6 +138,7 @@
#![deny(missing_docs)]
#![deny(missing_debug_implementations)]

mod bug;
mod domains;
mod email;
mod finder;
Expand Down
Loading