From: Ralf Jung Date: Mon, 13 Jul 2015 15:19:57 +0000 (+0200) Subject: part 13 draft: sorting, external dependencies X-Git-Url: https://git.ralfj.de/rust-101.git/commitdiff_plain/bae9e47884fdc3fc1a81fb4844572a832fcfb2ce?ds=sidebyside;hp=5f6e02d64e3789115ea4327a045b8ad3c39b1808 part 13 draft: sorting, external dependencies --- diff --git a/Cargo.lock b/Cargo.lock index b44a287..67ade17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,4 +1,63 @@ [root] name = "rust-101" version = "0.1.0" +dependencies = [ + "docopt 0.6.67 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "aho-corasick" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "docopt" +version = "0.6.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "regex 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libc" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + 
+[[package]] +name = "rustc-serialize" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/Cargo.toml b/Cargo.toml index 10572b4..e590353 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,3 +2,6 @@ name = "rust-101" version = "0.1.0" authors = ["Ralf Jung "] + +[dependencies] +docopt = "*" diff --git a/solutions/src/rgrep.rs b/solutions/src/rgrep.rs index a3b74cc..316e6f0 100644 --- a/solutions/src/rgrep.rs +++ b/solutions/src/rgrep.rs @@ -1,5 +1,5 @@ use std::io::prelude::*; -use std::{io, fs, thread, process}; +use std::{io, fs, thread, process, cmp}; use std::sync::mpsc::{sync_channel, SyncSender, Receiver}; use std::sync::Arc; @@ -23,6 +23,17 @@ struct Line { line: usize, } +impl PartialEq for Line { + fn eq(&self, other: &Line) -> bool { + self.data.eq(&other.data) + } +} +impl PartialOrd for Line { + fn partial_cmp(&self, other: &Line) -> Option { + self.data.partial_cmp(&other.data) + } +} + fn read_files(options: Arc, out_channel: SyncSender) { for (fileidx, file) in options.files.iter().enumerate() { let file = fs::File::open(file).unwrap(); @@ -42,6 +53,33 @@ fn filter_lines(options: Arc, in_channel: Receiver, out_channel: } } +fn sort(data: &mut [T]) { + if data.len() < 2 { return; } + + let mut lpos = 1; + let mut rpos = data.len(); + // Invariant: pivot is data[0]; (0,lpos) is <= pivot; [rpos,len) is >= pivot; lpos < rpos + loop { + while lpos < rpos && data[lpos] <= data[0] { + lpos += 1; + } + while rpos > lpos && data[rpos-1] >= data[0] { + rpos -= 1; + } + if rpos == lpos { + break; + } + + data.swap(lpos, rpos-1); + } + + data.swap(0, lpos-1); // put pivot in the right place + + let (part1, part2) = data.split_at_mut(lpos); + sort(&mut part1[..lpos-1]); + sort(part2); +} + fn output_lines(options: Arc, in_channel: Receiver) { match options.output_mode { Print => { @@ -54,8 +92,11 @@ fn 
output_lines(options: Arc, in_channel: Receiver) { println!("{} hits for {}.", count, options.pattern); }, SortAndPrint => { - let _data: Vec = in_channel.iter().collect(); - unimplemented!() + let mut data: Vec = in_channel.iter().collect(); + sort(&mut data[..]); + for line in data.iter() { + println!("{}:{}: {}", options.files[line.file], line.line, line.data); + } } } } diff --git a/src/main.rs b/src/main.rs index 8526698..0290eba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -80,7 +80,8 @@ // * [Part 10: Closures](part10.html) // * [Part 11: Trait Objects, Box, Rc, Lifetime bounds](part11.html) // * (to be continued) -#![allow(dead_code, unused_imports, unused_variables, unused_mut)] +#![allow(dead_code, unused_imports, unused_variables, unused_mut, unreachable_code)] +/* extern crate docopt; */ mod part00; mod part01; mod part02; @@ -94,14 +95,15 @@ mod part09; mod part10; mod part11; mod part12; +mod part13; // To actually run the code of some part (after filling in the blanks, if necessary), simply edit the `main` // function. - fn main() { part00::main(); } + // Additional material // ------------------- // diff --git a/src/part12.rs b/src/part12.rs index edcb9e0..477a3ae 100644 --- a/src/part12.rs +++ b/src/part12.rs @@ -14,17 +14,17 @@ use std::sync::Arc; // to complete the job: Which files to work on, which pattern to look for, and how to output.
// Besides just printing all the matching lines, we will also offer to count them, or alternatively to sort them. #[derive(Clone,Copy)] -enum OutputMode { +pub enum OutputMode { Print, SortAndPrint, Count, } use self::OutputMode::*; -struct Options { - files: Vec, - pattern: String, - output_mode: OutputMode, +pub struct Options { + pub files: Vec, + pub pattern: String, + pub output_mode: OutputMode, } //@ Now we can write three functions to do the actual job of reading, matching, and printing, respectively. @@ -87,7 +87,7 @@ fn output_lines(options: Arc, in_channel: Receiver) { }, SortAndPrint => { // We are asked to sort the matching lines before printing. So let's collect them all in a local vector... - let data: Vec = in_channel.iter().collect(); + let mut data: Vec = in_channel.iter().collect(); // ...and implement the actual sorting later. unimplemented!() } @@ -96,7 +96,7 @@ fn output_lines(options: Arc, in_channel: Receiver) { // With the operations of the three threads defined, we can now implement a function that performs grepping according // to some given options. -fn run(options: Options) { +pub fn run(options: Options) { // We move the `options` into an `Arc`, as that's what the thread workers expect. let options = Arc::new(options); @@ -125,7 +125,7 @@ fn run(options: Options) { handle3.join().unwrap(); } -// Now we have all the pieces together for testing our `rgrep` with some hard-coded options. +// Now we have all the pieces together for testing our rgrep with some hard-coded options. //@ We need to call `to_string` on string literals to convert them to a fully-owned `String`. pub fn main() { let options = Options { @@ -136,7 +136,7 @@ pub fn main() { run(options); } -// **Exercise 12.1**: Change `rgrep` such that it prints now only the matching lines, but also the name of the file +// **Exercise 12.1**: Change rgrep such that it prints now only the matching lines, but also the name of the file // and the number of the line in the file. 
You will have to change the type of the channels from `String` to something // that records this extra information. diff --git a/src/part13.rs b/src/part13.rs new file mode 100644 index 0000000..0121079 --- /dev/null +++ b/src/part13.rs @@ -0,0 +1,145 @@ +// Rust-101, Part 13: Slices, Arrays, External Dependencies +// ================= + +//@ To complete rgrep, there are two pieces we still need to implement: Sorting, and taking the job options +//@ as argument to the program, rather than hard-coding them. Let's start with sorting. + +// ## Slices +//@ Again, we first have to think about the type we want to give to our sorting function. We may be inclined to +//@ pass it a `Vec`. Now, sorting does not actually consume the argument, so we could make that a `&mut Vec`. +//@ But there's a problem with that: If we want to implement some divide-and-conquer sorting algorithm (say, +//@ Quicksort), then we will have to *split* our argument at some point, and operate recursively on the two parts. +//@ But we can't split a `Vec`! We could now extend the function signature to also take some indices, marking the +//@ part of the vector we are supposed to sort, but that's all rather clumsy. Rust offers a nicer solution. +//@ +//@ `[T]` is the type of an (unsized) *array*, with elements of type `T`. All this means is that there's a contiguous +//@ region of memory, where a bunch of `T` are stored. How many? We can't tell! This is an unsized type. Just like for +//@ trait objects, this means we can only operate on pointers to that type, and these pointers will contain the missing +//@ information - namely, the length. Such a pointer is called a *slice*. As we will see, a slice can be split! +//@ Our function can thus take a borrowed slice, and promise to sort all elements in there. 
+pub fn sort(data: &mut [T]) { + if data.len() < 2 { return; } + + // We decide that the element at 0 is our pivot, and then we move our cursors through the rest of the slice, + // making sure that everything on the left is no larger than the pivot, and everything on the right is no smaller. + let mut lpos = 1; + let mut rpos = data.len(); + /* Invariant: pivot is data[0]; everything with index (0,lpos) is <= pivot; [rpos,len) is >= pivot; lpos < rpos */ + loop { + // **Exercise 13.1**: Complete this Quicksort loop. You can use `swap` on slices to swap two elements. + unimplemented!() + } + + // Once our cursors met, we need to put the pivot in the right place. + data.swap(0, lpos-1); + + // Finally, we split our slice to sort the two halves. The nice part about slices is that splitting them is cheap: + //@ They are just a pointer to a start address, and a length. We can thus get two pointers, one at the beginning and + //@ one in the middle, and set the lengths appropriately such that they don't overlap. This is what `split_at_mut` does. + //@ Since the two slices don't overlap, there is no aliasing and we can have them both mutably borrowed. + let (part1, part2) = data.split_at_mut(lpos); + //@ The index operation can not only be used to address certain elements, it can also be used for "slicing": Giving a range + //@ of indices, and obtaining an appropriate part of the slice we started with. Here, we remove the last element from + //@ `part1`, which is the pivot. This makes sure both recursive calls work on strictly smaller slices. + sort(&mut part1[..lpos-1]); /*@*/ + sort(part2); /*@*/ +} + +// **Exercise 13.2**: Since `String` implements `PartialOrd`, you can now change the function `output_lines` in the previous part +// to call the sort function above. If you did exercise 12.1, you will have slightly more work. Make sure you sort by the matched line +// only, not by filename or line number! + +// Now, we can sort, e.g., a vector of numbers. 
+fn sort_nums(data: &mut Vec) { + //@ Vectors support slicing, just like slices do. Here, `..` denotes the full range, which means we want to slice the entire vector. + //@ It is then passed to the `sort` function, which doesn't even know that it is working on data inside a vector. + sort(&mut data[..]); +} + +// ## Arrays +//@ An *array* in Rust is given by the type `[T; n]`, where `n` is some *fixed* number. So, `[f64; 10]` is an array of 10 floating-point +//@ numbers, all one right next to the other in memory. Arrays are sized, and hence can be used like any other type. But we can also +//@ borrow them as slices, e.g., to sort them. +fn sort_array() { + let mut data: [f64; 5] = [1.0, 3.4, 12.7, -9.12, 0.1]; + sort(&mut data); +} + +// ## External Dependencies +//@ This leaves us with just one more piece to complete rgrep: Taking arguments from the command-line. We could now directly work on +//@ [`std::env::args`](http://doc.rust-lang.org/beta/std/env/fn.args.html) to gain access to those arguments, and this would become +//@ a pretty boring lesson in string manipulation. Instead, I want to use this opportunity to show how easy it is to benefit from +//@ other people's work in your program. +//@ +//@ For sure, we are not the first to equip a Rust program with support for command-line arguments. Someone must have written a library +//@ for the job, right? Indeed, someone has. Rust has a central repository of published libraries, called [crates.io](https://crates.io/). +//@ It's a bit like [PyPI](https://pypi.python.org/pypi) or the [Ruby Gems](https://rubygems.org/): Everybody can upload their code, +//@ and there's tooling for importing that code into your project. This tooling is provided by `cargo`, the tool we are already using to +//@ build this tutorial. (`cargo` also has support for *publishing* your crate on crates.io, I refer you to [the documentation](http://doc.crates.io/crates-io.html) for more details.) 
+//@ In this case, we are going to use the [`docopt` crate](https://crates.io/crates/docopt), which creates a parser for command-line +//@ arguments based on the usage string. External dependencies are declared in the `Cargo.toml` file. + +//@ I already prepared that file, but the declaration of the dependency is still commented out. So please open `Cargo.toml` of your workspace +//@ now, and enable the two commented-out lines. Then do `cargo build`. Cargo will now download the crate from crates.io, compile it, +//@ and link it to your program. In the future, you can do `cargo update` to make it download new versions of crates you depend on. +//@ Note that crates.io is only the default location for dependencies, you can also give it the URL of a git repository or some local +//@ path. All of this is explained in the [Cargo Guide](http://doc.crates.io/guide.html). + +// I disabled the following module (using a rather bad hack), because it only compiles if `docopt` is linked. However, before enabling it, +// you still have to get the external library into the global namespace. This is done with `extern crate docopt;`, and that statement *has* to be +// in `main.rs`. So please go there, and enable this commented-out line. Then remove the attribute of the following module. +#[cfg(feature = "disabled")] +pub mod rgrep { + // Now that `docopt` is linked and declared in `main.rs`, we can import it with `use`. We also import some other pieces that we will need. + use docopt::Docopt; + use part12::{run, Options, OutputMode}; + use std::process; + + // The USAGE string documents how the program is to be called. It's written in a format that `docopt` can parse. + static USAGE: &'static str = " +Usage: rgrep [-c] [-s] ... + +Options: + -c, --count Count number of matching lines (rather than printing them). + -s, --sort Sort the lines before printing. +"; + + // This function extracts the rgrep options from the command-line arguments. 
+ fn get_options() -> Options { + // Parse argv and exit the program with an error message if it fails. This is taken from the [`docopt` documentation](http://burntsushi.net/rustdoc/docopt/). + let args = Docopt::new(USAGE).and_then(|d| d.parse()).unwrap_or_else(|e| e.exit()); + // Now we can get all the values out. + let count = args.get_bool("-c"); + let sort = args.get_bool("-s"); + let pattern = args.get_str(""); + let files = args.get_vec(""); + if count && sort { + println!("Setting both '-c' and '-s' at the same time does not make any sense."); + process::exit(1); + } + + // We need to make the strings owned to construct the `Options` instance. + //@ If you check all the types carefully, you will notice that `pattern` above is of type `&str`. `str` is the type of a UTF-8 encoded string, that is, a bunch of + //@ bytes in memory (`[u8]`) that are valid according to UTF-8. `str` is unsized. `&str` is a sliced string, and stores the address of the character data, and + //@ their length. String literals like "this one" are of type `&'static str`: They point right to the constant section of the binary, so you cannot claim you + //@ own them. However, the borrow is valid for as long as the program runs, hence it has lifetime `'static`. Calling `to_string` will copy the string data + //@ into an owned buffer on the heap, and thus convert it to `String`. + Options { + files: files.iter().map(|file| file.to_string()).collect(), + pattern: pattern.to_string(), + output_mode: if count { OutputMode::Count } else if sort { OutputMode::SortAndPrint } else { OutputMode::Print }, + } + } + + // Finally, we can call the `run` function from the previous part on the options extracted using `get_options`. Edit `main.rs` to call this function. + // You can now use `cargo run -- ` to call your program, and see the argument parser and the threads we wrote previously in action! 
+ pub fn main() { + run(get_options()); + } +} + +// **Exercise 13.3**: Wouldn't it be nice if rgrep supported regular expressions? There's already a crate that does all the parsing and matching on regular +// expression, it's called [regex](https://crates.io/crates/regex). Add this crate to the dependencies of your workspace, add an option ("-r") to switch +// the pattern to regular-expression mode, and change `filter_lines` to honor this option. The documentation of regex is available from its crates.io site. + +//@ [index](main.html) | [previous](part12.html) | [next](main.html) diff --git a/workspace/Cargo.toml b/workspace/Cargo.toml index a038197..81228f7 100644 --- a/workspace/Cargo.toml +++ b/workspace/Cargo.toml @@ -1,3 +1,6 @@ [package] name = "rust-101-workspace" version = "0.0.0" + +#[dependencies] +#docopt = "*" diff --git a/workspace/src/main.rs b/workspace/src/main.rs index 98e8e8d..7e7c200 100644 --- a/workspace/src/main.rs +++ b/workspace/src/main.rs @@ -1,4 +1,4 @@ -#![allow(dead_code, unused_imports, unused_variables, unused_mut)] +#![allow(dead_code, unused_imports, unused_variables, unused_mut, unreachable_code)] // Only the files imported here will be compiled. mod part00; @@ -14,6 +14,7 @@ mod part09; mod part10; mod part11; mod part12; +mod part13; // This decides which part is actually run. fn main() { diff --git a/workspace/src/part12.rs b/workspace/src/part12.rs index 1d75bfd..84d47ec 100644 --- a/workspace/src/part12.rs +++ b/workspace/src/part12.rs @@ -11,17 +11,17 @@ use std::sync::Arc; // to complete the job: Which files to work on, which pattern to look for, and how to output.
// Besides just printing all the matching lines, we will also offer to count them, or alternatively to sort them. #[derive(Clone,Copy)] -enum OutputMode { +pub enum OutputMode { Print, SortAndPrint, Count, } use self::OutputMode::*; -struct Options { - files: Vec, - pattern: String, - output_mode: OutputMode, +pub struct Options { + pub files: Vec, + pub pattern: String, + pub output_mode: OutputMode, } @@ -70,7 +70,7 @@ fn output_lines(options: Arc, in_channel: Receiver) { }, SortAndPrint => { // We are asked to sort the matching lines before printing. So let's collect them all in a local vector... - let data: Vec = in_channel.iter().collect(); + let mut data: Vec = in_channel.iter().collect(); // ...and implement the actual sorting later. unimplemented!() } @@ -79,7 +79,7 @@ fn output_lines(options: Arc, in_channel: Receiver) { // With the operations of the three threads defined, we can now implement a function that performs grepping according // to some given options. -fn run(options: Options) { +pub fn run(options: Options) { // We move the `options` into an `Arc`, as that's what the thread workers expect. let options = Arc::new(options); @@ -105,7 +105,7 @@ fn run(options: Options) { handle3.join().unwrap(); } -// Now we have all the pieces together for testing our `rgrep` with some hard-coded options. +// Now we have all the pieces together for testing our rgrep with some hard-coded options. pub fn main() { let options = Options { files: vec!["src/part10.rs".to_string(), "src/part11.rs".to_string(), "src/part12.rs".to_string()], @@ -115,7 +115,7 @@ pub fn main() { run(options); } -// **Exercise 12.1**: Change `rgrep` such that it prints now only the matching lines, but also the name of the file +// **Exercise 12.1**: Change rgrep such that it prints now only the matching lines, but also the name of the file // and the number of the line in the file. 
You will have to change the type of the channels from `String` to something // that records this extra information. diff --git a/workspace/src/part13.rs b/workspace/src/part13.rs new file mode 100644 index 0000000..3ef7785 --- /dev/null +++ b/workspace/src/part13.rs @@ -0,0 +1,96 @@ +// Rust-101, Part 13: Slices, Arrays, External Dependencies +// ================= + + +// ## Slices +pub fn sort(data: &mut [T]) { + if data.len() < 2 { return; } + + // We decide that the element at 0 is our pivot, and then we move our cursors through the rest of the slice, + // making sure that everything on the left is no larger than the pivot, and everything on the right is no smaller. + let mut lpos = 1; + let mut rpos = data.len(); + /* Invariant: pivot is data[0]; everything with index (0,lpos) is <= pivot; [rpos,len) is >= pivot; lpos < rpos */ + loop { + // **Exercise 13.1**: Complete this Quicksort loop. You can use `swap` on slices to swap two elements. + unimplemented!() + } + + // Once our cursors met, we need to put the pivot in the right place. + data.swap(0, lpos-1); + + // Finally, we split our slice to sort the two halves. The nice part about slices is that splitting them is cheap: + let (part1, part2) = data.split_at_mut(lpos); + unimplemented!() +} + +// **Exercise 13.2**: Since `String` implements `PartialOrd`, you can now change the function `output_lines` in the previous part +// to call the sort function above. If you did exercise 12.1, you will have slightly more work. Make sure you sort by the matched line +// only, not by filename or line number! + +// Now, we can sort, e.g., a vector of numbers. +fn sort_nums(data: &mut Vec) { + sort(&mut data[..]); +} + +// ## Arrays +fn sort_array() { + let mut data: [f64; 5] = [1.0, 3.4, 12.7, -9.12, 0.1]; + sort(&mut data); +} + +// ## External Dependencies + + +// I disabled the following module (using a rather bad hack), because it only compiles if `docopt` is linked.
However, before enabling it, +// you still have to get the external library into the global namespace. This is done with `extern crate docopt;`, and that statement *has* to be +// in `main.rs`. So please go there, and enable this commented-out line. Then remove the attribute of the following module. +#[cfg(feature = "disabled")] +pub mod rgrep { + // Now that `docopt` is linked and declared in `main.rs`, we can import it with `use`. We also import some other pieces that we will need. + use docopt::Docopt; + use part12::{run, Options, OutputMode}; + use std::process; + + // The USAGE string documents how the program is to be called. It's written in a format that `docopt` can parse. + static USAGE: &'static str = " +Usage: rgrep [-c] [-s] ... + +Options: + -c, --count Count number of matching lines (rather than printing them). + -s, --sort Sort the lines before printing. +"; + + // This function extracts the rgrep options from the command-line arguments. + fn get_options() -> Options { + // Parse argv and exit the program with an error message if it fails. This is taken from the [`docopt` documentation](http://burntsushi.net/rustdoc/docopt/). + let args = Docopt::new(USAGE).and_then(|d| d.parse()).unwrap_or_else(|e| e.exit()); + // Now we can get all the values out. + let count = args.get_bool("-c"); + let sort = args.get_bool("-s"); + let pattern = args.get_str(""); + let files = args.get_vec(""); + if count && sort { + println!("Setting both '-c' and '-s' at the same time does not make any sense."); + process::exit(1); + } + + // We need to make the strings owned to construct the `Options` instance. + Options { + files: files.iter().map(|file| file.to_string()).collect(), + pattern: pattern.to_string(), + output_mode: if count { OutputMode::Count } else if sort { OutputMode::SortAndPrint } else { OutputMode::Print }, + } + } + + // Finally, we can call the `run` function from the previous part on the options extracted using `get_options`.
Edit `main.rs` to call this function. + // You can now use `cargo run -- ` to call your program, and see the argument parser and the threads we wrote previously in action! + pub fn main() { + run(get_options()); + } +} + +// **Exercise 13.3**: Wouldn't it be nice if rgrep supported regular expressions? There's already a crate that does all the parsing and matching on regular +// expression, it's called [regex](https://crates.io/crates/regex). Add this crate to the dependencies of your workspace, add an option ("-r") to switch +// the pattern to regular-expression mode, and change `filter_lines` to honor this option. The documentation of regex is available from its crates.io site. +