patternrustMinor
Article date extractor
Viewed 0 times
extractorarticledate
Problem
I am quite new to Rust and this is my first library written in it. It's an article date extractor heavily inspired by the original Python library as well as its Haskell port.
It is fairly small and most of the logic was taken from the Python library. I want to make sure my Rust code is idiomatic.
This is extract_date.rs:
```
use regex::Regex;
use chrono::NaiveDate;
use reqwest;
use std::io::Read;
use select::document::Document;
use select::predicate::{Name, Attr};
use rustc_serialize::json::Json;
use errors::*;
// Some formats borrowed from https://github.com/amir/article-date-extractor
static FMTS: &'static [&str] = &["%A, %B %e, %Y",
"%Y-%m-%dT%H:%M:%S%:z",
"/%Y/%m/%d/",
"/%Y/%d/%m/",
"%Y-%m-%d",
"%B %e, %Y",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%dT%H:%M:%SZ",
"%B %k, %Y, %H:%M %p",
"%Y-%m-%d %H:%M:%S.000000"];
// Use lazy_static to ensure we only compile the regex once
lazy_static! {
// Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
static ref RE: Regex =
Regex::new(r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})").unwrap();
}
// Parse the date, trying out each format
fn parse_date(input: &str) -> Result {
let mut result: Result = Err("None of the formats matched the date".into());
'outer: for fmt in FMTS {
if let Ok(v) = NaiveDate::parse_from_str(input, fmt) {
{
result = Ok(v);
break 'outer;
}
}
}
result
}
// Extract date from a URL
fn extract_from_url(url: &str) -> Option {
if let Some(val) = RE.find(url) {
return Some(val.as_str().to_string()
It is fairly small and most of the logic was taken from the Python library. I want to make sure my Rust code is idiomatic.
This is extract_date.rs:
```
use regex::Regex;
use chrono::NaiveDate;
use reqwest;
use std::io::Read;
use select::document::Document;
use select::predicate::{Name, Attr};
use rustc_serialize::json::Json;
use errors::*;
// Some formats borrowed from https://github.com/amir/article-date-extractor
static FMTS: &'static [&str] = &["%A, %B %e, %Y",
"%Y-%m-%dT%H:%M:%S%:z",
"/%Y/%m/%d/",
"/%Y/%d/%m/",
"%Y-%m-%d",
"%B %e, %Y",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%dT%H:%M:%SZ",
"%B %k, %Y, %H:%M %p",
"%Y-%m-%d %H:%M:%S.000000"];
// Use lazy_static to ensure we only compile the regex once
lazy_static! {
// Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
static ref RE: Regex =
Regex::new(r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})").unwrap();
}
// Parse the date, trying out each format
fn parse_date(input: &str) -> Result {
let mut result: Result = Err("None of the formats matched the date".into());
'outer: for fmt in FMTS {
if let Ok(v) = NaiveDate::parse_from_str(input, fmt) {
{
result = Ok(v);
break 'outer;
}
}
}
result
}
// Extract date from a URL
fn extract_from_url(url: &str) -> Option {
if let Some(val) = RE.find(url) {
return Some(val.as_str().to_string()
Solution
Type deduction
First of all, there is no need to specify types everywhere. The Rust compiler supports type deduction, so
is same as
Use type deduction whenever it is possible. DRY. BTW, I'd recommend to avoid names that start with underscores: readability suffers.
Statements vs expressions
Almost everything in Rust is an expression. Prefer expressions to statements. It allows to simplify code and avoid unnecessary initializations.
vs
This approach also allows to make more values immutable which in most cases leads to more effective code.
Branching
Rust standard library provides functions with which unnecessary branching in the code can be avoided. Let's combine this tip with the previous ones and see what happens:
=>
That's how idiomatic Rust code looks like!
I've also deleted code that downloads web page, because there is no need for it. This move frees library from network code and allows users use their favorite libraries for http requests (e.g. hyper).
First of all, there is no need to specify types everywhere. The Rust compiler supports type deduction, so
let mut _ldjson: String = String::new();is same as
let mut _ldjson = String::new();Use type deduction whenever it is possible. DRY. BTW, I'd recommend to avoid names that start with underscores: readability suffers.
Statements vs expressions
Almost everything in Rust is an expression. Prefer expressions to statements. It allows to simplify code and avoid unnecessary initializations.
let mut _decoded_ldjson: Json = Json::from_str("{}").unwrap();
match Json::from_str(&_ldjson) {
Ok(v) => _decoded_ldjson = v,
_ => return None,
}vs
let decoded_ldjson = match Json::from_str(&ldjson) {
Ok(v) => v,
_ => return None;
};This approach also allows to make more values immutable which in most cases leads to more effective code.
Branching
Rust standard library provides functions with which unnecessary branching in the code can be avoided. Let's combine this tip with the previous ones and see what happens:
pub fn extract_article_published_date(link: &str, html: Option) -> Result {
let mut body: String = String::new();
let mut _parsed_body: Option = None;
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
}
if html.is_none() {
if let Ok(mut response) = reqwest::get(link) {
response.read_to_string(&mut body).unwrap();
let doc = Document::from(body.as_str());
_parsed_body = Some(doc);
} else {
return Err("Couldn't open the link".into());
}
} else {
_parsed_body = Some(Document::from(html.unwrap().as_str()))
}
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
} else if let Some(v) = extract_from_ldjson(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_meta(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_html_tag(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else {
return Err("Couldn't find the date to parse".into());
}
}=>
pub fn extract_article_published_date(link: &str, html: &str) -> Result {
let doc = Document::from(html);
extract_from_url(link)
.or_else(|| extract_from_ldjson(doc.as_ref()))
.or_else(|| extract_from_meta(doc.as_ref()))
.or_else(|| extract_from_html_tag(doc.as_ref()))
.ok_or("Couldn't find the date to parse".into())
.map(|v| parse_date(&v))
}That's how idiomatic Rust code looks like!
I've also deleted code that downloads web page, because there is no need for it. This move frees library from network code and allows users use their favorite libraries for http requests (e.g. hyper).
Code Snippets
let mut _ldjson: String = String::new();let mut _ldjson = String::new();let mut _decoded_ldjson: Json = Json::from_str("{}").unwrap();
match Json::from_str(&_ldjson) {
Ok(v) => _decoded_ldjson = v,
_ => return None,
}let decoded_ldjson = match Json::from_str(&ldjson) {
Ok(v) => v,
_ => return None;
};pub fn extract_article_published_date(link: &str, html: Option<String>) -> Result<NaiveDate> {
let mut body: String = String::new();
let mut _parsed_body: Option<Document> = None;
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
}
if html.is_none() {
if let Ok(mut response) = reqwest::get(link) {
response.read_to_string(&mut body).unwrap();
let doc = Document::from(body.as_str());
_parsed_body = Some(doc);
} else {
return Err("Couldn't open the link".into());
}
} else {
_parsed_body = Some(Document::from(html.unwrap().as_str()))
}
if let Some(v) = extract_from_url(link) {
return parse_date(&v);
} else if let Some(v) = extract_from_ldjson(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_meta(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else if let Some(v) = extract_from_html_tag(_parsed_body.as_ref().unwrap()) {
return parse_date(&v);
} else {
return Err("Couldn't find the date to parse".into());
}
}Context
StackExchange Code Review Q#158516, answer score: 5
Revisions (0)
No revisions yet.