patternrustMinor
Enum as state for log parsing
Viewed 0 times
Tags: log, enum, parsing, for, state
Problem
I have a small project to help me analyze query latency from PostgreSQL statement logging.
It's my first time writing anything real in Rust and I think it could be improved in several ways. My question here is about this code from scanner.rs:
```
use std::hash::{Hash, SipHasher, Hasher};
use regex::Regex;
use std::collections::HashMap;
use std::string::String;
use std;
use csv::Writer;
// State machine for the log cruncher.
// NOTE(review): the generic parameters on `HashMap`, `Writer`, and `Vec`
// (e.g. `HashMap<i32, String>`, `Writer<Stdout>`) appear to have been
// stripped by HTML scraping; as written this will not compile — compare
// the fully parameterized copy later on this page.
pub enum CrunchState {
// Between queries: pid -> completed query text, plus the CSV sink.
Scanning(HashMap, Writer),
// Accumulating a multi-line query: its lines so far, the owning pid,
// the pid map, and the CSV sink.
CurrentQuery(Vec, i32, HashMap, Writer)
}
// Classification of a single log line.
enum MatchResult {
// Line is irrelevant to latency analysis.
Ignore,
// A statement began: (backend pid, first line of the query text).
QueryStart(i32, String),
// A "duration: N ms" line: (backend pid, duration in ms, kept as text).
Duration(i32, String)
}
lazy_static! {
// A new log record starts with the year (this log is from 2016).
static ref REGLS: Regex = Regex::new(r"^2016").unwrap();
// Captures the backend pid from a "NNN(PID):" prefix.
// NOTE(review): exact prefix shape assumed from the pattern — confirm
// against the actual log_line_prefix setting.
static ref REPID: Regex = Regex::new(r"\d{2,3}\((\d+)\):").unwrap();
// Captures the duration in milliseconds.
static ref REDURATION: Regex = Regex::new(r"duration: ([0-9.]+) ms").unwrap();
// Captures the statement text.
// NOTE(review): the single `.` occurrences here are `.*` in the final
// answer snippet further down this page; the `*` was likely lost to
// scraping, so as written this would capture only one character.
static ref RESTATEMENT: Regex = Regex::new(r"(?:execute.|statement):(.)").unwrap();
}
// Construct the initial Scanning state with an empty pid map,
// writing CSV output to stdout.
pub fn init_state() -> CrunchState {
// NOTE(review): the explicit `Writer` annotation is missing its type
// parameter (likely `Writer<Stdout>` lost in scraping); the reviewed
// version drops the annotation entirely and lets inference handle it.
let csv_writer: Writer = Writer::from_writer(std::io::stdout());
CrunchState::Scanning(HashMap::new(), csv_writer)
}
pub fn process_line(line:String, state:CrunchState) -> CrunchState {
match state {
CrunchState::Scanning(mut pid_to_query, mut csv_writer) => {
match analyze_line(line) {
MatchResult::Ignore => CrunchState::Scanning(pid_to_query, csv_writer),
MatchResult::QueryStart(pid, query_begin) => {
let query_parts = vec![query_begin];
CrunchState::CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
},
MatchResult::Duration(pid, duration) => {
match pid_to_query.remove(&pid) {
Some(full_query) => {
let mut hasher = SipHasher::new();
full_query.hash(&mut hasher);
let qhash = hasher.finish();
let result = csv_writer.encode((pid, duration, qhash
It's my first time writing anything real in Rust and I think it could be improved in several ways. My question here is about this code from
scanner.rs:```
use std::hash::{Hash, SipHasher, Hasher};
use regex::Regex;
use std::collections::HashMap;
use std::string::String;
use std;
use csv::Writer;
// State machine for the log cruncher (duplicate rendering of the question
// snippet). NOTE(review): generic parameters on `HashMap`/`Writer`/`Vec`
// look stripped by HTML scraping; this will not compile as shown.
pub enum CrunchState {
// Between queries: pid -> completed query text, plus the CSV sink.
Scanning(HashMap, Writer),
// Mid-query: accumulated lines, owning pid, pid map, CSV sink.
CurrentQuery(Vec, i32, HashMap, Writer)
}
// Classification of a single log line (duplicate rendering).
enum MatchResult {
// Irrelevant line.
Ignore,
// Statement began: (backend pid, first line of query text).
QueryStart(i32, String),
// Duration line: (backend pid, milliseconds as text).
Duration(i32, String)
}
lazy_static! {
// A new log record starts with the year (log is from 2016).
static ref REGLS: Regex = Regex::new(r"^2016").unwrap();
// Captures the backend pid from a "NNN(PID):" prefix.
static ref REPID: Regex = Regex::new(r"\d{2,3}\((\d+)\):").unwrap();
// Captures the duration in milliseconds.
static ref REDURATION: Regex = Regex::new(r"duration: ([0-9.]+) ms").unwrap();
// NOTE(review): `.` is `.*` in the final snippet on this page — the `*`
// appears lost to scraping.
static ref RESTATEMENT: Regex = Regex::new(r"(?:execute.|statement):(.)").unwrap();
}
// Build the initial Scanning state, CSV output on stdout.
pub fn init_state() -> CrunchState {
// NOTE(review): the bare `Writer` annotation is missing its type
// parameter (scraping artifact); the reviewed version omits it.
let csv_writer: Writer = Writer::from_writer(std::io::stdout());
CrunchState::Scanning(HashMap::new(), csv_writer)
}
pub fn process_line(line:String, state:CrunchState) -> CrunchState {
match state {
CrunchState::Scanning(mut pid_to_query, mut csv_writer) => {
match analyze_line(line) {
MatchResult::Ignore => CrunchState::Scanning(pid_to_query, csv_writer),
MatchResult::QueryStart(pid, query_begin) => {
let query_parts = vec![query_begin];
CrunchState::CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
},
MatchResult::Duration(pid, duration) => {
match pid_to_query.remove(&pid) {
Some(full_query) => {
let mut hasher = SipHasher::new();
full_query.hash(&mut hasher);
let qhash = hasher.finish();
let result = csv_writer.encode((pid, duration, qhash
Solution
-
The compiler warnings tell me that
-
Check out rustfmt. The code has issues with missing spaces after
-
Use
main.rs
scanner.rs
-
Don't use
-
You can create methods on enums just like on structs.
-
Accept a
-
Create a tiny helper method for getting the hash.
-
Consider glob-importing your enum into methods that deal with them heavily.
-
Use
-
Use
-
Avoid Hungarian notation (where you encode the type of something into the name of the variable). The regexen don't need to be prefixed with
-
Don't provide explicit types unless you are required.
-
There's no need for the turbofish when the type is constrained by the struct you are putting the value in.
-
Instead of
-
Try to avoid doing multiple regex calls for the same input. For example, calling
```
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::io::{self, Stdout};
use regex::Regex;
use csv::Writer;
// Parser state machine (reviewer's revised version).
// NOTE(review): generic parameters on `HashMap`/`Writer`/`Vec` look
// stripped by scraping here; the fully parameterized copy appears
// further down the page.
pub enum CrunchState {
// Between queries: pid -> completed query text, plus CSV sink.
Scanning(HashMap, Writer),
// Mid-query: accumulated lines, owning pid, pid map, CSV sink.
CurrentQuery(Vec, i32, HashMap, Writer),
}
/// One-shot hashing helper: feed the whole query string into a fresh
/// `DefaultHasher` and return the resulting 64-bit digest.
fn one_shot_hash(full_query: &str) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(full_query, &mut state);
    state.finish()
}
impl CrunchState {
/// Initial state: no partial queries, CSV output on stdout.
pub fn new() -> CrunchState {
let csv_writer = Writer::from_writer(io::stdout());
CrunchState::Scanning(HashMap::new(), csv_writer)
}
/// Consume one log line and return the next state. Takes `self` by
/// value so the state machine advances by ownership transfer rather
/// than mutation.
pub fn process_line(self, line: String) -> CrunchState {
use self::CrunchState::*;
use self::MatchResult::*;
match self {
Scanning(mut pid_to_query, mut csv_writer) => {
match analyze_line(&line) {
Ignore => Scanning(pid_to_query, csv_writer),
QueryStart(pid, query_begin) => {
// A new statement begins; start collecting its lines.
let query_parts = vec![query_begin];
CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
}
Duration(pid, duration) => {
// A duration line closes out the query previously stored
// for this pid; emit one CSV row per completed query.
if let Some(full_query) = pid_to_query.remove(&pid) {
let query_hash = one_shot_hash(&full_query);
let result = csv_writer.encode((pid, duration, query_hash, &full_query));
result.expect("Unable to write result");
}
Scanning(pid_to_query, csv_writer)
}
}
}
CurrentQuery(mut query_parts, pid, mut pid_to_query, csv_writer) => {
// Lines without a timestamp prefix are continuations of the
// current multi-line query.
if !GLS.is_match(&line) {
query_parts.push(line);
CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
} else {
// Timestamped line ends the query: join the parts, stash
// them by pid, then reprocess this line in Scanning mode.
let full_query = query_parts.into_iter().collect();
pid_to_query.insert(pid, full_query);
Scanning(pid_to_query, csv_writer).process_line(line)
}
}
}
}
}
lazy_static! {
// A new log record starts with the year (log is from 2016).
static ref GLS: Regex = Regex::new(r"^2016").unwrap();
// Captures the backend pid from a "NNN(PID):" prefix.
static ref PID: Regex = Regex::new(r"\d{2,3}\((\d+)\):").unwrap();
// Captures the duration in milliseconds.
static ref DURATION: Regex = Regex::new(r"duration: ([0-9.]+) ms").unwrap();
// NOTE(review): `.` is `.*` in the final copy of this snippet on this
// page — the `*` appears lost to scraping.
static ref STATEMENT: Regex = Regex::new(r"(?:execute.|statement):(.)").unwrap();
}
// Classification of a single log line.
enum MatchResult {
// Irrelevant line.
Ignore,
// Statement began: (backend pid, first line of query text).
QueryStart(i32, String),
// Duration line: (backend pid, milliseconds as text).
Duration(i32, String),
}
fn analyze_line(line: &str) -> MatchResult {
use self::MatchResult::*;
if GLS.is_match(&line) {
match PID.captures_iter(&line).nth(0) {
Some(cap) => {
let pid = cap.at(1).unwrap();
if DURATION.is_match(&line) {
let duration = DURATION.captures_iter(&line).nth(0).unwrap().at(1).unwrap();
Duration(pid.parse().unwrap(), duration.to_string())
The compiler warnings tell me that `SipHasher` has been deprecated; use `DefaultHasher` instead.-
Check out rustfmt. The code has issues with missing spaces after
: and ,.-
Use
lines on stdin instead of reimplementing it.main.rs
extern crate pg_crunch;
use std::io;
use std::io::prelude::*;
use pg_crunch::scanner::CrunchState;
fn main() {
// Thread the parser state machine through every line read from stdin;
// each call consumes the old state and yields the next one.
let mut state = CrunchState::new();
let stdin = io::stdin();
// Lock stdin once; `lines()` yields io::Result<String> per line.
for line in stdin.lock().lines() {
match line {
Ok(line) => state = state.process_line(line),
// Report read errors but keep processing subsequent lines.
Err(error) => println!("error: {}", error),
}
}
// NOTE(review): the trailing "scanner.rs" below is page text fused onto
// the closing brace by scraping, not part of the program.
}scanner.rs
-
Don't use
String; it's already imported. I'd probably prefer to import specific modules or types instead of referring to std.-
You can create methods on enums just like on structs.
new and process_line really feel like methods to me.-
Accept a
&str instead of a String unless you make use of the allocation. analyze_line is a good example.-
Create a tiny helper method for getting the hash.
-
Consider glob-importing your enum into methods that deal with them heavily.
-
Use
if let when there's only one interesting match arm.-
Use
collect to combine multiple strings from an iterator.-
Avoid Hungarian notation (where you encode the type of something into the name of the variable). The regexen don't need to be prefixed with
RE.-
Don't provide explicit types unless you are required.
: Writer is a good example.-
There's no need for the turbofish when the type is constrained by the struct you are putting the value in.
-
Instead of
assert!, call expect on the Result. This prints the error message and lets you add a bit more context.-
Try to avoid doing multiple regex calls for the same input. For example, calling
is_match is redundant if you are also going to call captures_iter. You should be able to tell if it matched by the result of captures_iter.```
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::io::{self, Stdout};
use regex::Regex;
use csv::Writer;
// Parser state machine (duplicate rendering of the revised snippet).
// NOTE(review): generic parameters look stripped by scraping here.
pub enum CrunchState {
// Between queries: pid -> completed query text, plus CSV sink.
Scanning(HashMap, Writer),
// Mid-query: accumulated lines, owning pid, pid map, CSV sink.
CurrentQuery(Vec, i32, HashMap, Writer),
}
/// Compute the 64-bit `DefaultHasher` digest of `full_query` in one shot.
fn one_shot_hash(full_query: &str) -> u64 {
    let mut digest = DefaultHasher::new();
    full_query.hash(&mut digest);
    Hasher::finish(&digest)
}
impl CrunchState {
/// Initial state: empty pid map, CSV output on stdout.
pub fn new() -> CrunchState {
let csv_writer = Writer::from_writer(io::stdout());
CrunchState::Scanning(HashMap::new(), csv_writer)
}
/// Consume one log line, return the successor state (by-value state
/// machine: no mutation, just ownership transfer).
pub fn process_line(self, line: String) -> CrunchState {
use self::CrunchState::*;
use self::MatchResult::*;
match self {
Scanning(mut pid_to_query, mut csv_writer) => {
match analyze_line(&line) {
Ignore => Scanning(pid_to_query, csv_writer),
QueryStart(pid, query_begin) => {
// Begin collecting a new statement's lines.
let query_parts = vec![query_begin];
CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
}
Duration(pid, duration) => {
// Duration completes the stored query for this pid;
// write one CSV row: (pid, duration, hash, query).
if let Some(full_query) = pid_to_query.remove(&pid) {
let query_hash = one_shot_hash(&full_query);
let result = csv_writer.encode((pid, duration, query_hash, &full_query));
result.expect("Unable to write result");
}
Scanning(pid_to_query, csv_writer)
}
}
}
CurrentQuery(mut query_parts, pid, mut pid_to_query, csv_writer) => {
// Non-timestamped lines continue the current query.
if !GLS.is_match(&line) {
query_parts.push(line);
CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
} else {
// Timestamped line ends it: join parts, store by pid, and
// re-dispatch this same line in the Scanning state.
let full_query = query_parts.into_iter().collect();
pid_to_query.insert(pid, full_query);
Scanning(pid_to_query, csv_writer).process_line(line)
}
}
}
}
}
lazy_static! {
// Record lines begin with the year (log is from 2016).
static ref GLS: Regex = Regex::new(r"^2016").unwrap();
// Captures the backend pid from a "NNN(PID):" prefix.
static ref PID: Regex = Regex::new(r"\d{2,3}\((\d+)\):").unwrap();
// Captures the duration in milliseconds.
static ref DURATION: Regex = Regex::new(r"duration: ([0-9.]+) ms").unwrap();
// NOTE(review): `.` is `.*` in the final copy on this page — the `*`
// appears lost to scraping.
static ref STATEMENT: Regex = Regex::new(r"(?:execute.|statement):(.)").unwrap();
}
// Classification of a single log line.
enum MatchResult {
// Irrelevant line.
Ignore,
// Statement began: (backend pid, first line of query text).
QueryStart(i32, String),
// Duration line: (backend pid, milliseconds as text).
Duration(i32, String),
}
fn analyze_line(line: &str) -> MatchResult {
use self::MatchResult::*;
if GLS.is_match(&line) {
match PID.captures_iter(&line).nth(0) {
Some(cap) => {
let pid = cap.at(1).unwrap();
if DURATION.is_match(&line) {
let duration = DURATION.captures_iter(&line).nth(0).unwrap().at(1).unwrap();
Duration(pid.parse().unwrap(), duration.to_string())
Code Snippets
extern crate pg_crunch;
use std::io;
use std::io::prelude::*;
use pg_crunch::scanner::CrunchState;
fn main() {
// Drive the parser: each stdin line consumes the current state and
// produces the next.
let mut state = CrunchState::new();
let stdin = io::stdin();
// Lock stdin once; `lines()` yields io::Result<String> per line.
for line in stdin.lock().lines() {
match line {
Ok(line) => state = state.process_line(line),
// Report read errors but keep going.
Err(error) => println!("error: {}", error),
}
}
// NOTE(review): the `use` statement fused onto the closing brace below
// is the start of the next snippet, glued on by scraping.
}use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::io::{self, Stdout};
use regex::Regex;
use csv::Writer;
// Parser state machine (the one copy on this page that kept its generic
// parameters through scraping).
pub enum CrunchState {
// Between queries: pid -> completed query text, plus the stdout CSV sink.
Scanning(HashMap<i32, String>, Writer<Stdout>),
// Mid-query: accumulated lines, owning pid, pid map, CSV sink.
CurrentQuery(Vec<String>, i32, HashMap<i32, String>, Writer<Stdout>),
}
/// Hash a completed query with the standard library's `DefaultHasher`
/// and return the 64-bit result.
fn one_shot_hash(full_query: &str) -> u64 {
    let mut h = DefaultHasher::new();
    full_query.hash(&mut h);
    h.finish()
}
impl CrunchState {
/// Build the initial Scanning state: no partial queries yet, CSV rows
/// go to stdout.
pub fn new() -> CrunchState {
let csv_writer = Writer::from_writer(io::stdout());
CrunchState::Scanning(HashMap::new(), csv_writer)
}
/// Consume one log line and return the next state. `self` is taken by
/// value: the state machine advances via ownership transfer, which lets
/// each arm move the map/writer into the successor state without clones.
pub fn process_line(self, line: String) -> CrunchState {
use self::CrunchState::*;
use self::MatchResult::*;
match self {
Scanning(mut pid_to_query, mut csv_writer) => {
match analyze_line(&line) {
Ignore => Scanning(pid_to_query, csv_writer),
QueryStart(pid, query_begin) => {
// New statement: start accumulating its text.
let query_parts = vec![query_begin];
CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
}
Duration(pid, duration) => {
// Duration closes out the query stored for this pid;
// emit one CSV row (pid, duration, hash, full query).
if let Some(full_query) = pid_to_query.remove(&pid) {
let query_hash = one_shot_hash(&full_query);
let result = csv_writer.encode((pid, duration, query_hash, &full_query));
result.expect("Unable to write result");
}
Scanning(pid_to_query, csv_writer)
}
}
}
CurrentQuery(mut query_parts, pid, mut pid_to_query, csv_writer) => {
// Lines that don't start a new record (no timestamp) are
// continuations of the query being collected.
if !GLS.is_match(&line) {
query_parts.push(line);
CurrentQuery(query_parts, pid, pid_to_query, csv_writer)
} else {
// A timestamped line ends the query: concatenate the parts
// via collect, file them under the pid, then reprocess this
// line in Scanning mode (it may itself start a query).
let full_query = query_parts.into_iter().collect();
pid_to_query.insert(pid, full_query);
Scanning(pid_to_query, csv_writer).process_line(line)
}
}
}
}
}
lazy_static! {
// Record lines begin with the year (this log is from 2016), so GLS
// doubles as the "new record starts here" detector.
static ref GLS: Regex = Regex::new(r"^2016").unwrap();
// Captures the backend pid from a "NNN(PID):" prefix.
// NOTE(review): prefix shape inferred from the pattern — confirm
// against the server's log_line_prefix setting.
static ref PID: Regex = Regex::new(r"\d{2,3}\((\d+)\):").unwrap();
// Captures the duration in milliseconds.
static ref DURATION: Regex = Regex::new(r"duration: ([0-9.]+) ms").unwrap();
// Captures the statement text after "execute ...:" or "statement:".
static ref STATEMENT: Regex = Regex::new(r"(?:execute.*|statement):(.*)").unwrap();
}
// Result of classifying one log line.
enum MatchResult {
// Line is irrelevant to latency analysis.
Ignore,
// A statement began: (backend pid, first line of the query text).
QueryStart(i32, String),
// A "duration: N ms" line: (backend pid, duration in ms as text).
Duration(i32, String),
}
fn analyze_line(line: &str) -> MatchResult {
use self::MatchResult::*;
if GLS.is_match(&line) {
match PID.captures_iter(&line).nth(0) {
Some(cap) => {
let pid = cap.at(1).unwrap();
if DURATION.is_match(&line) {
let duration = DURATION.captures_iter(&line).nth(0).unwrap().at(1).unwrap();
Context
StackExchange Code Review Q#146671, answer score: 2
Revisions (0)
No revisions yet.