diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..008b35d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +/_build +/cover +/deps +/doc +erl_crash.dump +*.ez diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..d48a5b1 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,12 @@ +language: elixir +elixir: + - 1.3.0 +sudo: false # to use faster container based build environment +notifications: + recipients: + - jose.valim@plataformatec.com.br +otp_release: + - 18.0 +after_script: + - mix deps.get --only docs + - MIX_ENV=docs mix inch.report diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d942ec4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Changelog + +## v0.11.0 + +Extracted from GenStage. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0599486 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# Flow + +`Flow` allows developers to express computations on collections, similar to the `Enum` and `Stream` modules, although computations will be executed in parallel using multiple `GenStage`s. + +## Installation + +Flow requires Elixir v1.3. Add `:flow` to your list of dependencies in mix.exs: + + def deps do + [{:flow, "~> 0.11"}] + end + +## License + +Same as Elixir. diff --git a/lib/flow.ex b/lib/flow.ex new file mode 100644 index 0000000..5e7e32c --- /dev/null +++ b/lib/flow.ex @@ -0,0 +1,1425 @@ +defmodule Flow do + @moduledoc ~S""" + Computational flows with stages. + + `Flow` allows developers to express computations + on collections, similar to the `Enum` and `Stream` modules, + although computations will be executed in parallel using + multiple `GenStage`s. + + Flow was also designed to work with both bounded (finite) + and unbounded (infinite) data. By default, Flow will work + with batches of 500 items. This means Flow will only show + improvements when working with larger collections. 
However, + for certain cases, such as IO-bound flows, a smaller batch size + can be configured through the `:min_demand` and `:max_demand` + options supported by `from_enumerable/2`, `from_stages/2` + and `partition/2`. + + Flow also provides the concepts of "windows" and "triggers", + which allow developers to split the data into arbitrary + windows according to event time. Triggers allow computations + to be materialized at different intervals, allowing developers + to peek at results as they are computed. + + This README will cover the main constructs and concepts behind + Flow with examples. There is also a presentation about GenStage + and Flow from José Valim at ElixirConf 2016, which also covers + data processing concepts for those unfamiliar with the domain: + https://youtu.be/srtMWzyqdp8?t=244 + + ## Example + + As an example, let's implement the classic word counting + algorithm using flow. The word counting program will receive + one file and count how many times each word appears in the + document. Using the `Enum` module it could be implemented + as follows: + + File.stream!("path/to/some/file") + |> Enum.flat_map(&String.split(&1, " ")) + |> Enum.reduce(%{}, fn word, acc -> + Map.update(acc, word, 1, & &1 + 1) + end) + |> Enum.to_list() + + Unfortunately, the implementation above is not very efficient, + as `Enum.flat_map/2` will build a list with all the words in + the document before reducing it. If the document is, for example, + 2GB, we will load 2GB of data into memory. + + We can improve the solution above by using the Stream module: + + File.stream!("path/to/some/file") + |> Stream.flat_map(&String.split(&1, " ")) + |> Enum.reduce(%{}, fn word, acc -> + Map.update(acc, word, 1, & &1 + 1) + end) + |> Enum.to_list() + + Now instead of loading the whole set into memory, we will only + keep the current line in memory while we process it. While this + allows us to process the whole data set efficiently, it does + not leverage concurrency. 
Flow solves that: + + File.stream!("path/to/some/file") + |> Flow.from_enumerable() + |> Flow.flat_map(&String.split(&1, " ")) + |> Flow.partition() + |> Flow.reduce(fn -> %{} end, fn word, acc -> + Map.update(acc, word, 1, & &1 + 1) + end) + |> Enum.to_list() + + To convert from Stream to Flow, we have made two changes: + + 1. We have replaced the calls to `Stream` with `Flow` + 2. We call `partition/1` so words are properly partitioned between stages + + The example above will use all available cores and will + keep an ongoing flow of data instead of traversing them + line by line. Once all data is computed, it is sent to the + process which invoked `Enum.to_list/1`. + + While we gain concurrency by using Flow, many of the benefits + of Flow are in partitioning the data. We will discuss + the need for data partitioning next. + + ## Partitioning + + To understand the need to partition the data, let's change the + example above and remove the partition call: + + File.stream!("path/to/some/file") + |> Flow.from_enumerable() + |> Flow.flat_map(&String.split(&1, " ")) + |> Flow.reduce(fn -> %{} end, fn word, acc -> + Map.update(acc, word, 1, & &1 + 1) + end) + |> Enum.to_list() + + This will execute the `flat_map` and `reduce` + operations in parallel inside multiple stages. 
When running + on a machine with two cores: + + [file stream] # Flow.from_enumerable/1 (producer) + | | + [M1] [M2] # Flow.flat_map/2 + Flow.reduce/3 (consumer) + + Now imagine that the `M1` and `M2` stages above receive the + following lines: + + M1 - "roses are red" + M2 - "violets are blue" + + `flat_map/2` will break them into: + + M1 - ["roses", "are", "red"] + M2 - ["violets", "are", "blue"] + + Then `reduce/3` will result in each stage having the following state: + + M1 - %{"roses" => 1, "are" => 1, "red" => 1} + M2 - %{"violets" => 1, "are" => 1, "blue" => 1} + + Which is converted to the list (in no particular order): + + [{"roses", 1}, + {"are", 1}, + {"red", 1}, + {"violets", 1}, + {"are", 1}, + {"blue", 1}] + + Although both stages have performed word counting, we have words + like "are" that appear on both stages. This means we would need + to perform yet another pass on the data merging the duplicated + words across stages. + + Partitioning solves this by introducing a new set of stages and + making sure the same word is always mapped to the same stage + with the help of a hash function. Let's introduce the call to + `partition/1` back: + + File.stream!("path/to/some/file") + |> Flow.from_enumerable() + |> Flow.flat_map(&String.split(&1, " ")) + |> Flow.partition() + |> Flow.reduce(fn -> %{} end, fn word, acc -> + Map.update(acc, word, 1, & &1 + 1) + end) + |> Enum.to_list() + + Now we will have the following topology: + + [file stream] # Flow.from_enumerable/1 (producer) + | | + [M1] [M2] # Flow.flat_map/2 (producer-consumer) + |\ /| + | \/ | + |/ \ | + [R1] [R2] # Flow.reduce/3 (consumer) + + If the `M1` and `M2` stages receive the same lines and break + them into words as before: + + M1 - ["roses", "are", "red"] + M2 - ["violets", "are", "blue"] + + Now, any given word will be consistently routed to `R1` or `R2` + regardless of its origin. 
The default hashing function will route + them like this: + + R1 - ["roses", "are", "red", "are"] + R2 - ["violets", "blue"] + + Resulting in the reduced state of: + + R1 - %{"roses" => 1, "are" => 2, "red" => 1} + R2 - %{"violets" => 1, "blue" => 1} + + Which is converted to the list (in no particular order): + + [{"roses", 1}, + {"are", 2}, + {"red", 1}, + {"violets", 1}, + {"blue", 1}] + + Each stage has a distinct subset of the data so we know + that we don't need to merge the data later on, because a given + word is guaranteed to have only been routed to one stage. + + Partitioning the data is a very useful technique. For example, + if we wanted to count the number of unique elements in a dataset, + we could perform such a count in each partition and then sum + their results, as the partitioning guarantees the data in + each partition won't overlap. A unique element would never + be counted twice. + + The topology above alongside partitioning is very common in + the MapReduce programming model which we will briefly discuss + next. + + ### MapReduce + + The MapReduce programming model forces us to break our computations + in two stages: map and reduce. The map stage is often quite easy to + parallelize because events are processed individually and in isolation. + The reduce stages need to group the data either partially or completely. + + In the example above, the stages executing `flat_map/2` are the + mapper stages. Because the `flat_map/2` function works line by line, + we can have two, four, eight or more mapper processes that will + break line by line into words without any need for coordination. + + However, the reducing stage is a bit more complicated. Reducer + stages typically aggregate some result based on their inputs, such + as how many times a word has appeared. This implies reducer + computations need to traverse the whole data set and, in order + to do so in parallel, we partition the data into distinct + datasets. 
+ + The goal of the `reduce/3` operation is to accumulate a value + which then becomes the partition state. Any operation that + happens after `reduce/3` works on the whole state and is only + executed after all the data for a partition is collected. + + While this approach works well for bounded (finite) data, it + is quite limited for unbounded (infinite) data. After all, if + the reduce operation needs to traverse the whole partition to + complete, how can we do so if the data never finishes? + + To answer this question, we need to talk about data completion, + triggers and windows. + + ## Data completion, windows and triggers + + By default, Flow uses `GenStage`'s notification system to notify + stages when a producer has emitted all events. This is done + automatically by Flow when using `from_enumerable/2`. Custom + producers can also send such notifications by calling + `GenStage.async_notification/2` from themselves: + + # In the case all the data is done + GenStage.async_notification(self(), {:producer, :done}) + + # In the case the producer halted due to an external factor + GenStage.async_notification(self(), {:producer, :halt}) + + However, when working with an unbounded stream of data, there is + no such thing as data completion. So when can we consider a reduce + function to be "completed"? + + To handle such cases, Flow provides windows and triggers. Windows + allow us to split the data based on the event time while triggers + tell us when to write the results we have computed so far. By + introducing windows, we no longer think the events are partitioned + across stages. Instead each event belongs to a window and the window + is partitioned across the stages. + + By default, all events belong to the same window (called the global + window) which is partitioned across stages. However, different + windowing strategies can be used by building a `Flow.Window` + and passing it as the `:window` option to the `Flow.partition/2` function. 
+ + Once a window is specified, we can create triggers that tell us + when to checkpoint the data, allowing us to report our progress + while the data streams through the system, regardless if the data + is bounded or unbounded. + + Windows and triggers effectively control how the `reduce/3` function + works. `reduce/3` is invoked per window while a trigger configures + when `reduce/3` halts so we can checkpoint the data before resuming + the computation with an old or new accumulator. See `Flow.Window` + for a complete introduction into windows and triggers. + + ## Supervisable flows + + In the examples so far we have started a flow dynamically + and consumed it using `Enum.to_list/1`. Unfortunately calling + a function from `Enum` will cause the whole computed dataset + to be sent to a single process. + + In many situations, this is either too expensive or completely + undesirable. For example, in data-processing pipelines, it is + common to receive data continuously from external sources. At + the end, this data is written to disk or another storage mechanism + after being processed, rather than being sent to a single process. + + Flow allows computations to be started as a group of processes + which may run indefinitely. This can be done by starting + the flow as part of a supervision tree using `Flow.start_link/2`. + `Flow.into_stages/3` can also be used to start the flow as a + linked process which will send the events to the given consumers. + + ## Performance discussions + + In this section we will discuss points related to performance + with flows. + + ### Know your code + + There are many optimizations we could perform in the flow above + that are not necessarily related to flows themselves. 
Let's rewrite + the flow using some of them: + + # The parent process which will own the table + parent = self() + + # Let's compile common patterns for performance + empty_space = :binary.compile_pattern(" ") # BINARY + + File.stream!("path/to/some/file", read_ahead: 100_000) # READ_AHEAD + |> Flow.from_enumerable() + |> Flow.flat_map(&String.split(&1, empty_space)) # BINARY + |> Flow.partition() + |> Flow.reduce(fn -> :ets.new(:words, []) end, fn word, ets -> # ETS + :ets.update_counter(ets, word, {2, 1}, {word, 0}) + ets + end) + |> Flow.map_state(fn ets -> # ETS + :ets.give_away(ets, parent, []) + [ets] + end) + |> Enum.to_list() + + We have performed three optimizations: + + * BINARY - the first optimization is to compile the pattern we use + to split the string on + + * READ_AHEAD - the second optimization is to use the `:read_ahead` + option for file streams allowing us to do fewer IO operations by + reading large chunks of data at once + + * ETS - the third stores the data in an ETS table and uses its counter + operations. For counters and a large dataset this provides a great + performance benefit as it generates less garbage. At the end, we + call `map_state/2` to transfer the ETS table to the parent process + and wrap the table in a list so we can access it on `Enum.to_list/1`. + This step is not strictly required. For example, one could write the + table to disk with `:ets.tab2file/2` at the end of the computation + + ### Configuration (demand and the number of stages) + + `from_enumerable/2`, `from_stages/2` and `partition/2` allow a set of + options to configure how flows work. In particular, we recommend that + developers play with the `:min_demand` and `:max_demand` options, which + control the amount of data sent between stages. The difference between + `max_demand` and `min_demand` works as the batch size when the producer + is full. If the producer has fewer events than requested by consumers, + it usually sends the remaining events available. 
+ + If stages perform IO, it may also be worth increasing + the number of stages. The default value is `System.schedulers_online/0`, + which is a good default if the stages are CPU bound, but if stages + are waiting on external resources or other processes, increasing the + number of stages may be helpful. + + ### Avoid single sources + + In the examples so far we have used a single file as our data + source. In practice such should be avoided as the source could + end up being the bottleneck of our whole computation. + + In the file stream case above, instead of having one single + large file, it is preferable to break the file into smaller + ones: + + streams = for file <- File.ls!("dir/with/files") do + File.stream!("dir/with/files/#{file}", read_ahead: 100_000) + end + + streams + |> Flow.from_enumerables() + |> Flow.flat_map(&String.split(&1, " ")) + |> Flow.partition() + |> Flow.reduce(fn -> %{} end, fn word, acc -> + Map.update(acc, word, 1, & &1 + 1) + end) + |> Enum.to_list() + + Instead of calling `from_enumerable/1`, we now called + `from_enumerables/1` which expects a list of enumerables to + be used as source. Notice every stream also uses the `:read_ahead` + option which tells Elixir to buffer file data in memory to + avoid multiple IO lookups. + + If the number of enumerables is equal to or greater than the number of + cores, Flow will automatically fuse the enumerables with the mapper + logic. 
For example, if three file streams are given as enumerables + to a machine with two cores, we will have the following topology: + + [F1][F2][F3] # file stream + [M1][M2][M3] # Flow.flat_map/2 (producer) + |\ /\ /| + | /\/\ | + |// \\| + [R1][R2] # Flow.reduce/3 (consumer) + + """ + + defstruct producers: nil, window: nil, options: [], operations: [] + @type t :: %Flow{producers: producers, operations: [operation], + options: keyword(), window: Flow.Window.t} + + @typep producers :: nil | + {:stages, [GenStage.stage]} | + {:enumerables, [Enumerable.t]} | + {:join, atom(), t, t, fun(), fun(), fun()} | # mode, left, right, left_key, right_key, join + {:flows, [t]} + + @typep operation :: {:mapper, atom(), [term()]} | + {:partition, keyword()} | + {:map_state, fun()} | + {:reduce, fun(), fun()} | + {:window, Flow.Window.t} + + ## Building + + @doc """ + Starts a flow with the given enumerable as the producer. + + Calling this function is equivalent to: + + Flow.from_enumerables([enumerable], options) + + The enumerable is consumed in batches, retrieving `max_demand` + items the first time and then `max_demand - min_demand` the + next times. Therefore, for streams that cannot produce items + that fast, it is recommended to pass a lower `:max_demand` + value as an option. + + ## Examples + + "some/file" + |> File.stream!(read_ahead: 100_000) + |> Flow.from_enumerable() + + some_network_based_stream() + |> Flow.from_enumerable(max_demand: 20) + + """ + @spec from_enumerable(Enumerable.t, keyword) :: t + def from_enumerable(enumerable, options \\ []) + + def from_enumerable(%Flow{}, _options) do + raise ArgumentError, "passing a Flow to Flow.from_enumerable/2 is not supported. " <> + "Did you mean to use Flow.partition/2 or Flow.merge/2?" + end + + def from_enumerable(enumerable, options) do + from_enumerables([enumerable], options) + end + + @doc """ + Starts a flow with the given list of enumerables as producers. 
+ + Each enumerable is consumed in batches, retrieving `max_demand` + items the first time and then `max_demand - min_demand` the + next times. Therefore, for streams that cannot produce items + that fast, it is recommended to pass a lower `:max_demand` + value as an option. + + See `GenStage.from_enumerable/2` for information and + limitations on enumerable-based stages. + + ## Options + + These options configure the stages connected to producers before partitioning. + + * `:window` - a window to run the next stages in, see `Flow.Window` (defaults to `Flow.Window.global`) + * `:stages` - the number of stages + * `:buffer_keep` - how the buffer should behave, see `c:GenStage.init/1` + * `:buffer_size` - how many events to buffer, see `c:GenStage.init/1` + + All remaining options are sent during subscription, allowing developers + to customize `:min_demand`, `:max_demand` and others. + + ## Examples + + files = [File.stream!("some/file1", read_ahead: 100_000), + File.stream!("some/file2", read_ahead: 100_000), + File.stream!("some/file3", read_ahead: 100_000)] + Flow.from_enumerables(files) + """ + @spec from_enumerables([Enumerable.t], keyword) :: t + def from_enumerables(enumerables, options \\ []) + + def from_enumerables([_ | _] = enumerables, options) do + options = stages(options) + {window, options} = Keyword.pop(options, :window, Flow.Window.global) + %Flow{producers: {:enumerables, enumerables}, options: options, window: window} + end + def from_enumerables(enumerables, _options) do + raise ArgumentError, "from_enumerables/2 expects a non-empty list as argument, got: #{inspect enumerables}" + end + + @doc """ + Starts a flow with the given stage as producer. + + Calling this function is equivalent to: + + Flow.from_stages([stage], options) + + See `from_stages/2` for more information. 
+ + ## Examples + + Flow.from_stage(MyStage) + + """ + @spec from_stage(GenStage.stage, keyword) :: t + def from_stage(stage, options \\ []) do + from_stages([stage], options) + end + + @doc """ + Starts a flow with the list of stages as producers. + + ## Options + + These options configure the stages connected to producers before partitioning. + + * `:window` - a window to run the next stages in, see `Flow.Window` (defaults to `Flow.Window.global`) + * `:stages` - the number of stages + * `:buffer_keep` - how the buffer should behave, see `c:GenStage.init/1` + * `:buffer_size` - how many events to buffer, see `c:GenStage.init/1` + + All remaining options are sent during subscription, allowing developers + to customize `:min_demand`, `:max_demand` and others. + + ## Examples + + stages = [pid1, pid2, pid3] + Flow.from_stages(stages) + + ## Termination + + Producer stages can signal the flow that they have emitted all + events by emitting a notification using `GenStage.async_notification/2` + from themselves: + + # In the case all the data is done + GenStage.async_notification(self(), {:producer, :done}) + + # In the case the producer halted due to an external factor + GenStage.async_notification(self(), {:producer, :halt}) + + Your producer may also keep track of all consumers and automatically + shut down when all consumers have exited. + """ + @spec from_stages([GenStage.stage], keyword) :: t + def from_stages(stages, options \\ []) + + def from_stages([_ | _] = stages, options) do + options = stages(options) + {window, options} = Keyword.pop(options, :window, Flow.Window.global) + %Flow{producers: {:stages, stages}, options: options, window: window} + end + def from_stages(stages, _options) do + raise ArgumentError, "from_stages/2 expects a non-empty list as argument, got: #{inspect stages}" + end + + @joins [:inner, :left_outer, :right_outer, :full_outer] + + @doc """ + Joins two bounded (finite) flows. 
+ + It expects the `left` and `right` flow, the `left_key` and + `right_key` to calculate the key for both flows and the `join` + function which is invoked whenever there is a match. + + A join creates a new partitioned flow that subscribes to the + two flows given as arguments. The newly created partitions + will accumulate the data received from both flows until there + is no more data. Therefore, this function is useful for merging + finite flows. If used for merging infinite flows, you will + eventually run out of memory due to the accumulated data. See + `window_join/8` for applying a window to a join, allowing the + join data to be reset per window. + + The join has 4 modes: + + * `:inner` - data will only be emitted when there is a match + between the keys in left and right side + * `:left_outer` - similar to `:inner` plus all items given + in the left that did not have a match will be emitted at the + end with `nil` for the right value + * `:right_outer` - similar to `:inner` plus all items given + in the right that did not have a match will be emitted at the + end with `nil` for the left value + * `:full_outer` - similar to `:inner` plus all items given + in the left and right that did not have a match will be emitted + at the end with `nil` for the right and left value respectively + + The joined partitions can be configured via `options` with the + same values as shown on `from_enumerable/2` or `from_stages/2`. 
+ + ## Examples + + iex> posts = [%{id: 1, title: "hello"}, %{id: 2, title: "world"}] + iex> comments = [{1, "excellent"}, {1, "outstanding"}, + ...> {2, "great follow up"}, {3, "unknown"}] + iex> flow = Flow.bounded_join(:inner, + ...> Flow.from_enumerable(posts), + ...> Flow.from_enumerable(comments), + ...> & &1.id, # left key + ...> & elem(&1, 0), # right key + ...> fn post, {_post_id, comment} -> Map.put(post, :comment, comment) end) + iex> Enum.sort(flow) + [%{id: 1, title: "hello", comment: "excellent"}, + %{id: 2, title: "world", comment: "great follow up"}, + %{id: 1, title: "hello", comment: "outstanding"}] + + """ + @spec bounded_join(:inner | :left_outer | :right_outer | :full_outer, t, t, + fun(), fun(), fun(), keyword()) :: t + def bounded_join(mode, %Flow{} = left, %Flow{} = right, + left_key, right_key, join, options \\ []) + when is_function(left_key, 1) and is_function(right_key, 1) and + is_function(join, 2) and mode in @joins do + window_join(mode, left, right, Flow.Window.global, left_key, right_key, join, options) + end + + @doc """ + Joins two flows with the given window. + + It is similar to `bounded_join/7` with the addition that a window + can be given. The window function applies to elements of both + left and right side in isolation (and not the joined value). A + trigger will cause the join state to be cleared. + + ## Examples + + As an example, let's expand the example given in `bounded_join/7` + and apply a window to it. 
The example in `bounded_join/7` returned + 3 results but in this example, because we will split the posts + and comments in two different windows, we will get only two results + as the later comment for `post_id=1` won't have a matching post in + its window: + + iex> posts = [%{id: 1, title: "hello", timestamp: 0}, %{id: 2, title: "world", timestamp: 1000}] + iex> comments = [{1, "excellent", 0}, {1, "outstanding", 1000}, + ...> {2, "great follow up", 1000}, {3, "unknown", 1000}] + iex> window = Flow.Window.fixed(1, :second, fn + ...> {_, _, timestamp} -> timestamp + ...> %{timestamp: timestamp} -> timestamp + ...> end) + iex> flow = Flow.window_join(:inner, + ...> Flow.from_enumerable(posts), + ...> Flow.from_enumerable(comments), + ...> window, + ...> & &1.id, # left key + ...> & elem(&1, 0), # right key + ...> fn post, {_post_id, comment, _ts} -> Map.put(post, :comment, comment) end, + ...> stages: 1, max_demand: 1) + iex> Enum.sort(flow) + [%{id: 1, title: "hello", comment: "excellent", timestamp: 0}, + %{id: 2, title: "world", comment: "great follow up", timestamp: 1000}] + + """ + @spec window_join(:inner | :left_outer | :right_outer | :full_outer, t, t, Flow.Window.t, + fun(), fun(), fun(), keyword()) :: t + def window_join(mode, %Flow{} = left, %Flow{} = right, %{} = window, + left_key, right_key, join, options \\ []) + when is_function(left_key, 1) and is_function(right_key, 1) and + is_function(join, 2) and mode in @joins do + options = stages(options) + %Flow{producers: {:join, mode, left, right, left_key, right_key, join}, + options: options, window: window} + end + + @doc """ + Runs a given flow. + + This runs the given flow as a stream for its side-effects. No + items are sent from the flow to the current process. 
+ + ## Examples + + iex> parent = self() + iex> [1, 2, 3] |> Flow.from_enumerable() |> Flow.each(&send(parent, &1)) |> Flow.run() + :ok + iex> receive do + ...> 1 -> :ok + ...> end + :ok + + """ + @spec run(t) :: :ok + def run(flow) do + [] = flow |> emit(:nothing) |> Enum.to_list() # the match asserts that no events reach the caller + :ok + end + + @doc """ + Starts and runs the flow as a separate process. + + See `into_stages/3` in case you want the flow to + work as a producer for another series of stages. + + ## Options + + * `:dispatcher` - the dispatcher responsible for handling demands. + Defaults to `GenStage.DemandDispatcher`. May be either an atom or + a tuple with the dispatcher and the dispatcher options + + * `:demand` - configures the demand on the flow producers to `:forward` + or `:accumulate`. The default is `:forward`. See `GenStage.demand/2` + for more information. + + """ + @spec start_link(t, keyword()) :: GenServer.on_start + def start_link(flow, options \\ []) do + Flow.Coordinator.start_link(emit(flow, :nothing), :consumer, [], options) + end + + @doc """ + Starts and runs the flow as a separate process which + will be a producer to the given `consumers`. + + It expects a list of consumers to subscribe to. Each element + represents the consumer or a tuple with the consumer and the + subscription options as defined in `GenStage.sync_subscribe/2`. + + Receives the same options as `start_link/2`. + """ + @spec into_stages(t, consumers, keyword()) :: GenServer.on_start when + consumers: [GenStage.stage | {GenStage.stage, keyword()}] + def into_stages(flow, consumers, options \\ []) do + Flow.Coordinator.start_link(flow, :producer_consumer, consumers, options) + end + + ## Mappers + + @doc """ + Applies the given function to each input without modifying it. 
+ + ## Examples + + iex> parent = self() + iex> [1, 2, 3] |> Flow.from_enumerable() |> Flow.each(&send(parent, &1)) |> Enum.sort() + [1, 2, 3] + iex> receive do + ...> 1 -> :ok + ...> end + :ok + + """ + @spec each(t, (term -> term)) :: t + def each(flow, each) when is_function(each, 1) do + add_operation(flow, {:mapper, :each, [each]}) + end + + @doc """ + Applies the given function filtering each input in parallel. + + ## Examples + + iex> flow = [1, 2, 3] |> Flow.from_enumerable() |> Flow.filter(& rem(&1, 2) == 0) + iex> Enum.sort(flow) # Call sort as we have no order guarantee + [2] + + """ + @spec filter(t, (term -> term)) :: t + def filter(flow, filter) when is_function(filter, 1) do + add_operation(flow, {:mapper, :filter, [filter]}) + end + + @doc """ + Applies the given function filtering and mapping each input in parallel. + + ## Examples + + iex> flow = [1, 2, 3] |> Flow.from_enumerable() |> Flow.filter_map(& rem(&1, 2) == 0, & &1 * 2) + iex> Enum.sort(flow) # Call sort as we have no order guarantee + [4] + + """ + @spec filter_map(t, (term -> term), (term -> term)) :: t + def filter_map(flow, filter, mapper) when is_function(filter, 1) and is_function(mapper, 1) do + add_operation(flow, {:mapper, :filter_map, [filter, mapper]}) + end + + @doc """ + Applies the given function mapping each input in parallel. + + ## Examples + + iex> flow = [1, 2, 3] |> Flow.from_enumerable() |> Flow.map(& &1 * 2) + iex> Enum.sort(flow) # Call sort as we have no order guarantee + [2, 4, 6] + + iex> flow = Flow.from_enumerables([[1, 2, 3], 1..3]) |> Flow.map(& &1 * 2) + iex> Enum.sort(flow) + [2, 2, 4, 4, 6, 6] + + """ + @spec map(t, (term -> term)) :: t + def map(flow, mapper) when is_function(mapper, 1) do + add_operation(flow, {:mapper, :map, [mapper]}) + end + + @doc """ + Maps over the given values in the stage state. + + It is expected that the state emits two-element tuples, + as keyword lists, maps, etc. do. 
+ + ## Examples + + iex> flow = Flow.from_enumerable([foo: 1, foo: 2, bar: 3, foo: 4, bar: 5], stages: 1) + iex> flow |> Flow.group_by_key |> Flow.map_values(&Enum.sort/1) |> Enum.sort() + [bar: [3, 5], foo: [1, 2, 4]] + + """ + def map_values(flow, value_fun) when is_function(value_fun, 1) do + map(flow, fn {key, value} -> {key, value_fun.(value)} end) # value_fun is always called with exactly one argument + end + + @doc """ + Applies the given function mapping each input in parallel and + flattening the result, but only one level deep. + + ## Examples + + iex> flow = [1, 2, 3] |> Flow.from_enumerable() |> Flow.flat_map(fn(x) -> [x, x * 2] end) + iex> Enum.sort(flow) # Call sort as we have no order guarantee + [1, 2, 2, 3, 4, 6] + + """ + @spec flat_map(t, (term -> Enumerable.t)) :: t + def flat_map(flow, flat_mapper) when is_function(flat_mapper, 1) do + add_operation(flow, {:mapper, :flat_map, [flat_mapper]}) + end + + @doc """ + Applies the given function rejecting each input in parallel. + + ## Examples + + iex> flow = [1, 2, 3] |> Flow.from_enumerable() |> Flow.reject(& rem(&1, 2) == 0) + iex> Enum.sort(flow) # Call sort as we have no order guarantee + [1, 3] + + """ + @spec reject(t, (term -> term)) :: t + def reject(flow, filter) when is_function(filter, 1) do + add_operation(flow, {:mapper, :reject, [filter]}) + end + + ## Reducers + + @doc """ + Creates a new partition for the given flow with the given options. + + Every time this function is called, a new partition + is created. It is typically recommended to invoke it + before a reducing function, such as `reduce/3`, so data + belonging to the same partition can be kept together. + + ## Examples + + flow |> Flow.partition(window: Flow.Window.global) + flow |> Flow.partition(stages: 4) + + ## Options + + * `:window` - a `Flow.Window` struct which controls how the + reducing function behaves, see `Flow.Window` for more information. + * `:stages` - the number of partitions (reducer stages) + * `:key` - the key to use when partitioning. 
It is a function
+      that receives a single argument: the event and must return its key.
+      To facilitate customization, `:key` also allows common values, such as
+      `{:elem, integer}` and `{:key, atom}`, to calculate the hash based on a
+      tuple or a map field. See the "Key shortcuts" section below.
+    * `:hash` - the hashing function. By default a hashing function is built
+      on the key but a custom one may be specified as described in
+      `GenStage.PartitionDispatcher`
+    * `:dispatcher` - by default, `partition/2` uses `GenStage.PartitionDispatcher`
+      with the given hash function but any other dispatcher can be given
+    * `:min_demand` - the minimum demand for this subscription
+    * `:max_demand` - the maximum demand for this subscription
+
+  ## Key shortcuts
+
+  The following shortcuts can be given to the `:key` option:
+
+    * `{:elem, pos}` - apply the hash function to the element
+      at position `pos` in the given tuple
+
+    * `{:key, key}` - apply the hash function to the key of a given map
+
+  """
+  @spec partition(t, keyword()) :: t
+  def partition(flow, options \\ []) when is_list(options) do
+    merge([flow], options) # a partition is a merge of a single flow
+  end
+
+  @doc """
+  Reduces windows over multiple partitions into a single stage.
+
+  Once `departition/5` is called, computations no longer
+  happen concurrently until the data is once again partitioned.
+
+  `departition/5` is typically invoked as the last step in a flow
+  to merge the state from all previous partitions per window.
+
+  It requires a flow and three functions as arguments as
+  described:
+
+    * the accumulator function - a zero-arity function that returns
+      the initial accumulator. This function is invoked per window.
+    * the merger function - a function that receives the state of
+      a given partition and the accumulator and merges them together.
+    * the done function - a function that receives the final accumulator
+      and, optionally, the window as a second argument.
+
+  A set of options may also be given to customize with the `:window`,
+  `:min_demand` and `:max_demand`.
+
+  ## Examples
+
+  For example, imagine we are counting words in a document. Each
+  partition ends up with a map of words as keys and counts as values.
+  In the examples in the module documentation, we streamed those
+  results to a single client using `Enum.to_list/1`. However, we
+  could use `departition/5` to reduce the data over multiple stages
+  returning one single map with all results:
+
+      File.stream!("path/to/some/file")
+      |> Flow.from_enumerable()
+      |> Flow.flat_map(&String.split/1)
+      |> Flow.partition()
+      |> Flow.reduce(fn -> %{} end, fn event, acc -> Map.update(acc, event, 1, & &1 + 1) end)
+      |> Flow.departition(&Map.new/0, &Map.merge/2, &(&1))
+      |> Enum.to_list
+
+  The departition function expects the initial accumulator, a function
+  that merges the data, and a final function invoked when the computation
+  is done.
+
+  Departition also works with windows and triggers. A new accumulator
+  is created per window and the merge function is invoked with the state
+  every time a trigger is emitted in any of the partitions. This can be
+  useful to compute the final state as computations happen instead of one
+  time at the end. For example, we could change the flow above so each
+  partition emits its whole intermediary state every 1000 items, merging
+  it into the departition more frequently:
+
+      File.stream!("path/to/some/file")
+      |> Flow.from_enumerable()
+      |> Flow.flat_map(&String.split/1)
+      |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_every(1000, :reset))
+      |> Flow.reduce(fn -> %{} end, fn event, acc -> Map.update(acc, event, 1, & &1 + 1) end)
+      |> Flow.departition(&Map.new/0, &Map.merge(&1, &2, fn _, v1, v2 -> v1 + v2 end), &(&1))
+      |> Enum.to_list
+
+  Each approach is going to have different performance characteristics
+  and it is important to measure to verify which one will be more efficient
+  for the problem at hand.
+  """
+  def departition(%Flow{} = flow, acc_fun, merge_fun, done_fun, options \\ [])
+      when is_function(acc_fun, 0) and is_function(merge_fun, 2) and
+           (is_function(done_fun, 1) or is_function(done_fun, 2)) do
+    if not has_reduce?(flow) do
+      raise ArgumentError, "departition/5 must be called after a group_by/reduce operation"
+    end
+
+    # Normalize the done function to arity 2: (accumulator, window).
+    finalizer =
+      cond do
+        is_function(done_fun, 2) -> done_fun
+        true -> fn acc, _window -> done_fun.(acc) end
+      end
+
+    # Every partition emits its state tagged with its own index and the trigger.
+    tagged_flow =
+      map_state(flow, fn state, {partition, _total}, trigger ->
+        [{state, partition, trigger}]
+      end)
+
+    # The departition itself always runs on a single stage with a demand dispatcher.
+    single_stage_options =
+      options
+      |> Keyword.put(:dispatcher, GenStage.DemandDispatcher)
+      |> Keyword.put(:stages, 1)
+
+    {window, single_stage_options} =
+      Keyword.pop(single_stage_options, :window, Flow.Window.global)
+
+    %Flow{producers: {:departition, tagged_flow, acc_fun, merge_fun, finalizer},
+          options: single_stage_options, window: window}
+  end
+
+  @doc """
+  Merges the given flows into a new partition with the given
+  window and options.
+
+  Similar to `partition/2`, this function will partition
+  the data, routing events with the same characteristics
+  to the same partition.
+
+  It accepts the same options and hash shortcuts as
+  `partition/2`. See `partition/2` for more information.
+
+  ## Examples
+
+      Flow.merge([flow1, flow2], window: Flow.Window.global)
+      Flow.merge([flow1, flow2], stages: 4)
+
+  """
+  @spec merge([t], keyword()) :: t
+  def merge(flows, options \\ [])
+
+  def merge([%Flow{} | _] = flows, options) when is_list(options) do
+    options = stages(options)
+    {window, options} = Keyword.pop(options, :window, Flow.Window.global)
+    %Flow{producers: {:flows, flows}, options: options, window: window}
+  end
+  def merge(other, options) when is_list(options) do
+    raise ArgumentError, "Flow.merge/2 expects a non-empty list of flows as first argument, got: #{inspect other}"
+  end
+
+  # Defaults `:stages` to the number of online schedulers when not given.
+  # The scheduler count is only looked up when the default is needed.
+  defp stages(options) do
+    Keyword.put_new_lazy(options, :stages, &System.schedulers_online/0)
+  end
+
+  @doc """
+  Reduces the given values with the given accumulator.
+
+  `acc_fun` is a function that receives no arguments and returns
+  the actual accumulator. The `acc_fun` function is invoked per window
+  whenever a new window starts. If a trigger is emitted and it is
+  configured to reset the accumulator, the `acc_fun` function will
+  be invoked once again.
+
+  Reducing will accumulate data until a trigger is emitted
+  or until a window completes. When that happens, the returned
+  accumulator will be the new state of the stage and all functions
+  after reduce will be invoked.
+
+  ## Examples
+
+      iex> flow = Flow.from_enumerable(["the quick brown fox"]) |> Flow.flat_map(fn word ->
+      ...>    String.graphemes(word)
+      ...> end)
+      iex> flow = flow |> Flow.partition |> Flow.reduce(fn -> %{} end, fn grapheme, map ->
+      ...>   Map.update(map, grapheme, 1, & &1 + 1)
+      ...> end)
+      iex> Enum.sort(flow)
+      [{" ", 3}, {"b", 1}, {"c", 1}, {"e", 1}, {"f", 1},
+       {"h", 1}, {"i", 1}, {"k", 1}, {"n", 1}, {"o", 2},
+       {"q", 1}, {"r", 1}, {"t", 1}, {"u", 1}, {"w", 1},
+       {"x", 1}]
+
+  """
+  @spec reduce(t, (() -> acc), (term, acc -> acc)) :: t when acc: term()
+  def reduce(flow, acc_fun, reducer_fun) when is_function(reducer_fun, 2) do
+    # Only one reduce is allowed per partition; this is checked before the accumulator.
+    if has_reduce?(flow) do
+      raise ArgumentError, "cannot call group_by/reduce on a flow after another group_by/reduce operation " <>
+                           "(it must be called only once per partition, consider using map_state/2 instead)"
+    end
+
+    unless is_function(acc_fun, 0) do
+      raise ArgumentError, "Flow.reduce/3 expects the accumulator to be given as a function"
+    end
+
+    add_operation(flow, {:reduce, acc_fun, reducer_fun})
+  end
+
+  @doc """
+  Takes `n` events according to the sort function.
+
+  This function allows developers to calculate the top `n` entries
+  (or the bottom `n` entries) by performing most of the work
+  concurrently.
+
+  First `n` events are taken from every partition and then those `n`
+  events from every partition are merged into a single partition. The
+  final result is a flow with a single partition that will emit a list
+  with the top `n` events. The sorting is given by the `sort_fun`.
+
+  `take_sort/3` is built on top of departition, which means it will
+  also take and sort entries across windows.
+
+  ## Examples
+
+  As an example, imagine you are processing a list of URLs and you want
+  the list of the most accessed URLs.
+
+      iex> urls = ~w(www.foo.com www.bar.com www.foo.com www.foo.com www.baz.com)
+      iex> flow = urls |> Flow.from_enumerable |> Flow.partition()
+      iex> flow = flow |> Flow.reduce(fn -> %{} end, fn url, map ->
+      ...>   Map.update(map, url, 1, & &1 + 1)
+      ...> end)
+      iex> flow = flow |> Flow.take_sort(1, fn {_url_a, count_a}, {_url_b, count_b} ->
+      ...>   count_b <= count_a
+      ...> end)
+      iex> Enum.to_list(flow)
+      [[{"www.foo.com", 3}]]
+
+  """
+  @spec take_sort(t, pos_integer, (term, term -> boolean)) :: t
+  def take_sort(flow, n, sort_fun \\ &<=/2)
+      when is_integer(n) and n > 0 and is_function(sort_fun, 2) do
+    unless has_reduce?(flow) do
+      raise ArgumentError, "take_sort/3 must be called after a group_by/reduce operation"
+    end
+
+    # Each partition sorts its own state and keeps its top `n`; the single
+    # departition stage then merges those already-sorted lists.
+    flow
+    |> map_state(& &1 |> Enum.sort(sort_fun) |> Enum.take(n))
+    |> departition(fn -> [] end, &merge_sorted(&1, &2, n, sort_fun), fn x -> x end)
+  end
+
+  # Merges two lists already sorted by `sort`, keeping at most `n` entries.
+  # When either side is empty the other one is returned as is.
+  defp merge_sorted([], other, _, _), do: other
+  defp merge_sorted(other, [], _, _), do: other
+  defp merge_sorted(left, right, n, sort), do: merge_sorted(left, right, 0, n, sort)
+
+  # `count` is the number of entries emitted so far; stop once it reaches `n`.
+  defp merge_sorted(_, _, count, count, _sort), do: []
+  defp merge_sorted(lefties, [], count, n, _sort), do: Enum.take(lefties, n - count)
+  defp merge_sorted([], righties, count, n, _sort), do: Enum.take(righties, n - count)
+
+  defp merge_sorted([left | lefties], [right | righties], count, n, sort) do
+    case sort.(left, right) do
+      true ->
+        [left | merge_sorted(lefties, [right | righties], count + 1, n, sort)]
+      false ->
+        [right | merge_sorted([left | lefties], righties, count + 1, n, sort)]
+    end
+  end
+
+  @doc """
+  Groups events with the given `key_fun`.
+
+  This is a reduce operation that groups events into maps
+  where the key is the key returned by `key_fun` and the
+  value is a list of values in reverse order as returned by
+  `value_fun`. The resulting map becomes the stage state.
+
+  ## Examples
+
+      iex> flow = Flow.from_enumerable(~w[the quick brown fox], stages: 1)
+      iex> flow |> Flow.group_by(&String.length/1) |> Flow.emit(:state) |> Enum.to_list()
+      [%{3 => ["fox", "the"], 5 => ["brown", "quick"]}]
+
+  """
+  @spec group_by(t, (term -> term), (term -> term)) :: t
+  def group_by(flow, key_fun, value_fun \\ fn x -> x end)
+      when is_function(key_fun, 1) and is_function(value_fun, 1) do
+    # Prepends each mapped value to the list stored under the entry's key.
+    grouper = fn entry, categories ->
+      value = value_fun.(entry)
+      key = key_fun.(entry)
+      Map.update(categories, key, [value], fn values -> [value | values] end)
+    end
+
+    reduce(flow, fn -> %{} end, grouper)
+  end
+
+  @doc """
+  Groups a series of `{key, value}` tuples by keys.
+
+  This is a reduce operation that groups events into maps
+  with the given key and a list of values with the given keys
+  in reverse order. The resulting map becomes the stage state.
+
+  ## Examples
+
+      iex> flow = Flow.from_enumerable([foo: 1, foo: 2, bar: 3, foo: 4, bar: 5], stages: 1)
+      iex> flow |> Flow.group_by_key |> Flow.emit(:state) |> Enum.to_list()
+      [%{foo: [4, 2, 1], bar: [5, 3]}]
+
+  """
+  @spec group_by_key(t) :: t
+  def group_by_key(flow) do
+    grouper = fn {key, value}, acc ->
+      Map.update(acc, key, [value], fn values -> [value | values] end)
+    end
+
+    reduce(flow, fn -> %{} end, grouper)
+  end
+
+  @doc """
+  Only emit unique events.
+
+  Calling this function is equivalent to:
+
+      Flow.uniq_by(flow, & &1)
+
+  See `uniq_by/2` for more information.
+  """
+  def uniq(flow) do
+    identity = fn event -> event end
+    uniq_by(flow, identity)
+  end
+
+  @doc """
+  Only emit events that are unique according to the `by` function.
+
+  In order to verify if an item is unique or not, `uniq_by/2`
+  must store the value computed by `by/1` into a set. This means
+  that, when working with unbounded data, it is recommended to
+  wrap `uniq_by/2` in a window otherwise the data set will grow
+  forever, eventually using all memory available.
+
+  Also keep in mind that `uniq_by/2` is applied per partition.
+  Therefore, if the data is not uniquely divided per partition,
+  it won't be able to calculate the unique items properly.
+
+  ## Examples
+
+  To get started, let's create a flow that emits only the first
+  odd and even number for a range:
+
+      iex> flow = Flow.from_enumerable(1..100)
+      iex> flow = Flow.partition(flow, stages: 1)
+      iex> flow |> Flow.uniq_by(&rem(&1, 2)) |> Enum.sort()
+      [1, 2]
+
+  Since we have used only one stage when partitioning, we
+  correctly calculate `[1, 2]` for the given partition. Let's see
+  what happens when we increase the number of stages in the partition:
+
+      iex> flow = Flow.from_enumerable(1..100)
+      iex> flow = Flow.partition(flow, stages: 4)
+      iex> flow |> Flow.uniq_by(&rem(&1, 2)) |> Enum.sort()
+      [1, 2, 3, 4, 10, 16, 23, 39]
+
+  Now we got 8 numbers, one odd and one even *per partition*. If
+  we want to compute the unique items across the whole flow, we must
+  properly hash the events into two distinct partitions, one for odd
+  numbers and another for even numbers:
+
+      iex> flow = Flow.from_enumerable(1..100)
+      iex> flow = Flow.partition(flow, stages: 2, hash: fn event -> {event, rem(event, 2)} end)
+      iex> flow |> Flow.uniq_by(&rem(&1, 2)) |> Enum.sort()
+      [1, 2]
+  """
+  @spec uniq_by(t, (term -> term)) :: t
+  def uniq_by(flow, by) when is_function(by, 1) do
+    add_operation(flow, {:uniq, by}) # recorded as its own operation kind, not as a :mapper
+  end
+
+  @doc """
+  Controls which values should be emitted from now on.
+
+  The argument can be either `:events`, `:state` or `:nothing`.
+  When `:state` is given, this step must be called after the reduce
+  operation and it will guarantee the state is a list that can be
+  sent downstream.
+
+  Most commonly `:events` is used and each partition will emit the events it has
+  processed to the next stages. However, sometimes we want
+  to emit counters or other data structures as a result of
+  our computations. In such cases, the emit argument can be
+  set to `:state`, to return the `:state` from `reduce/3`
+  or `map_state/2` or even the processed collection as a whole. The
+  argument value of `:nothing` is used by `run/1` and `start_link/2`.
+  """
+  @spec emit(t, :events | :state | :nothing) :: t | Enumerable.t
+  def emit(flow, :events), do: flow
+  def emit(flow, :state) do
+    if has_reduce?(flow) do
+      # Wrap the state in a list so it can be sent downstream as one event.
+      map_state(flow, fn acc, _, _ -> [acc] end)
+    else
+      raise ArgumentError, "emit/2 must be called after a group_by/reduce operation"
+    end
+  end
+  def emit(%{operations: operations} = flow, :nothing) do
+    if inject_to_nothing(operations) == :map_state do
+      # There already is a reduce: replace whatever state it built with no events.
+      map_state(flow, fn _, _, _ -> [] end)
+    else
+      # No reduce yet: add one that discards every event.
+      reduce(flow, fn -> [] end, fn _, acc -> acc end)
+    end
+  end
+  def emit(_, emit) do
+    raise ArgumentError, "unknown option for emit: #{inspect emit}"
+  end
+
+  # Returns which operation must be injected in order to emit nothing:
+  # :map_state when the operations already contain a reduce, :reduce otherwise.
+  defp inject_to_nothing([]), do: :reduce
+  defp inject_to_nothing([{:reduce, _, _} | _]), do: :map_state
+  defp inject_to_nothing([_ | rest]), do: inject_to_nothing(rest)
+
+  @doc """
+  Applies the given function over the window state.
+
+  This function must be called after `reduce/3` as it maps over
+  the state accumulated by `reduce/3`. `map_state/2` is invoked
+  per window on every stage whenever there is a trigger: this
+  gives us an understanding of the window data while leveraging
+  the parallelism between stages.
+
+  ## The mapper function
+
+  The `mapper` function may have arity 1, 2 or 3.
+
+  The first argument is the state.
+
+  The second argument is optional and contains the partition index.
+  The partition index is a two-element tuple identifying the current
+  partition and the total number of partitions as the second element. For
+  example, for a partition with 4 stages, the partition index will be
+  the values `{0, 4}`, `{1, 4}`, `{2, 4}` and `{3, 4}`.
+
+  The third argument is optional and contains the window-trigger information.
+  This information is a three-element tuple containing the window name,
+  the window identifier, and the trigger name.
For example, a global window
+  created with `Flow.Window.global/0` will emit on termination:
+
+      {:global, :global, :done}
+
+  A `Flow.Window.global/0` window with a count trigger created with
+  `Flow.Window.trigger_every/2` will also emit:
+
+      {:global, :global, {:every, 20}}
+
+  A `Flow.Window.fixed/3` window will emit on done:
+
+      {:fixed, window, :done}
+
+  Where `window` is an integer identifying the timestamp for the window
+  being triggered.
+
+  The value returned by the `mapper` function is passed forward to the
+  upcoming flow functions.
+
+  ## Examples
+
+  We can use `map_state/2` to transform the collection after
+  processing. For example, if we want to count the number of
+  unique letters in a sentence, we can partition the data,
+  then reduce over the unique entries and finally return the
+  size of each stage, summing it all:
+
+      iex> flow = Flow.from_enumerable(["the quick brown fox"]) |> Flow.flat_map(fn word ->
+      ...>    String.graphemes(word)
+      ...> end)
+      iex> flow = Flow.partition(flow)
+      iex> flow = Flow.reduce(flow, fn -> %{} end, &Map.put(&2, &1, true))
+      iex> flow |> Flow.map_state(fn map -> map_size(map) end) |> Flow.emit(:state) |> Enum.sum()
+      16
+
+  """
+  @spec map_state(t, (term -> term) |
+                     (term, term -> term) |
+                     (term, term, {Flow.Window.type, Flow.Window.id, Flow.Window.trigger} -> term)) :: t
+  def map_state(flow, mapper) when is_function(mapper, 3) do
+    do_map_state(flow, mapper) # already in the 3-arity form stored in the operation
+  end
+  def map_state(flow, mapper) when is_function(mapper, 2) do
+    do_map_state(flow, fn acc, index, _ -> mapper.(acc, index) end) # drop the trigger
+  end
+  def map_state(flow, mapper) when is_function(mapper, 1) do
+    do_map_state(flow, fn acc, _, _ -> mapper.(acc) end) # drop index and trigger
+  end
+  defp do_map_state(flow, mapper) do
+    unless has_reduce?(flow) do
+      raise ArgumentError, "map_state/2 must be called after a group_by/reduce operation"
+    end
+    add_operation(flow, {:map_state, mapper})
+  end
+
+  @doc """
+  Applies the given function over the stage state without changing its value.
+
+  It is similar to `map_state/2` except that the value returned by `mapper`
+  is ignored.
+
+      iex> parent = self()
+      iex> flow = Flow.from_enumerable(["the quick brown fox"]) |> Flow.flat_map(fn word ->
+      ...>    String.graphemes(word)
+      ...> end)
+      iex> flow = flow |> Flow.partition(stages: 2) |> Flow.reduce(fn -> %{} end, &Map.put(&2, &1, true))
+      iex> flow = flow |> Flow.each_state(fn map -> send(parent, map_size(map)) end)
+      iex> Flow.run(flow)
+      iex> receive do
+      ...>   6 -> :ok
+      ...> end
+      :ok
+      iex> receive do
+      ...>   10 -> :ok
+      ...> end
+      :ok
+
+  """
+  @spec each_state(t, (term -> term) |
+                      (term, term -> term) |
+                      (term, term, {Flow.Window.type, Flow.Window.id, Flow.Window.trigger} -> term)) :: t
+  def each_state(flow, mapper) when is_function(mapper, 3) do
+    do_each_state(flow, fn acc, index, trigger ->
+      mapper.(acc, index, trigger)
+      acc
+    end)
+  end
+  def each_state(flow, mapper) when is_function(mapper, 2) do
+    do_each_state(flow, fn acc, index, _trigger ->
+      mapper.(acc, index)
+      acc
+    end)
+  end
+  def each_state(flow, mapper) when is_function(mapper, 1) do
+    do_each_state(flow, fn acc, _index, _trigger ->
+      mapper.(acc)
+      acc
+    end)
+  end
+
+  # The wrapped mapper always returns the state untouched, so it can be
+  # stored as a regular :map_state operation.
+  defp do_each_state(flow, mapper) do
+    if has_reduce?(flow) do
+      add_operation(flow, {:map_state, mapper})
+    else
+      raise ArgumentError, "each_state/2 must be called after a group_by/reduce operation"
+    end
+  end
+
+  # Operations are prepended, so the list is kept newest first.
+  defp add_operation(%Flow{operations: operations} = flow, operation) do
+    %{flow | operations: [operation | operations]}
+  end
+  defp add_operation(flow, _operation) do
+    raise ArgumentError, "expected a flow as argument, got: #{inspect flow}"
+  end
+
+  defp has_reduce?(%{operations: operations}) do
+    Enum.any?(operations, fn
+      {:reduce, _, _} -> true
+      _ -> false
+    end)
+  end
+
+  defimpl Enumerable do
+    def reduce(flow, acc, fun) do
+      case Flow.Coordinator.start(flow, :producer_consumer, [], [demand: :accumulate]) do
+        {:ok, pid} ->
+          Flow.Coordinator.stream(pid).(acc, fun)
+        {:error, reason} ->
+          exit({reason, {__MODULE__, :reduce, [flow, acc, fun]}})
+      end
+    end
+
+    def count(_flow) do
+      {:error, __MODULE__} # per the Enumerable protocol, falls back to the reduce/3-based default
+    end
+
+    def member?(_flow, _value) do
+      {:error, __MODULE__} # same fallback as count/1
+    end
+  end
+end
diff --git a/lib/flow/coordinator.ex b/lib/flow/coordinator.ex
new file mode 100644
index 0000000..b60cb7b
--- /dev/null
+++ b/lib/flow/coordinator.ex
@@ -0,0 +1,86 @@
+defmodule Flow.Coordinator do
+  @moduledoc false
+  use GenServer
+
+  def start_link(flow, type, consumers, options) do
+    GenServer.start_link(__MODULE__, {self(), flow, type, consumers, options}, options) # self() is the parent monitored in init/1
+  end
+
+  def start(flow, type, consumers, options) do
+    GenServer.start(__MODULE__, {self(), flow, type, consumers, options}, options) # same as start_link/4 but unlinked
+  end
+
+  def stream(pid) do
+    GenServer.call(pid, :stream, :infinity)
+  end
+
+  ## Callbacks
+
+  def init({parent, flow, type, consumers, options}) do
+    Process.flag(:trap_exit, true) # NOTE(review): presumably so terminate/2 runs on exit - confirm
+    {:ok, sup} = start_supervisor()
+    start_link = &Supervisor.start_child(sup, [&1, &2, &3]) # every stage is started as a child of sup
+    type_options = Keyword.take(options, [:dispatcher])
+
+    {producers, intermediary} =
+      Flow.Materialize.materialize(flow, start_link, type, type_options)
+
+    demand = Keyword.get(options, :demand, :forward)
+    producers = Enum.map(producers, &elem(&1, 0)) # keep only the first element of each producer tuple
+
+    refs =
+      for {pid, _} <- intermediary do
+        for consumer <- consumers do
+          subscribe(consumer, pid)
+        end
+        Process.monitor(pid) # one monitor ref per intermediary stage, see handle_info/2
+      end
+
+    for producer <- producers,
+        demand == :forward do # skipped for any other :demand value, e.g. :accumulate
+      GenStage.demand(producer, demand)
+    end
+
+    {:ok, %{supervisor: sup, producers: producers, intermediary: intermediary,
+            refs: refs, parent_ref: Process.monitor(parent)}}
+  end
+
+  defp start_supervisor() do
+    children = [Supervisor.Spec.worker(GenStage, [], restart: :transient)]
+    Supervisor.start_link(children, strategy: :simple_one_for_one, max_restarts: 0)
+  end
+
+  defp subscribe({consumer, opts}, producer) when is_list(opts) do
+    GenStage.sync_subscribe(consumer, [to: producer] ++ opts) # consumer given together with subscription options
+  end
+  defp subscribe(consumer, producer) do
+    GenStage.sync_subscribe(consumer, [to: producer])
+  end
+
+  def
handle_call(:stream, _from, %{producers: producers, intermediary: intermediary} = state) do
+    {:reply, GenStage.stream(intermediary, producers: producers), state}
+  end
+
+  def handle_cast({:"$demand", demand}, %{producers: producers} = state) do
+    for producer <- producers, do: GenStage.demand(producer, demand) # forward the demand mode to every producer
+    {:noreply, state}
+  end
+
+  def handle_info({:DOWN, ref, _, _, reason}, %{parent_ref: ref} = state) do
+    {:stop, reason, state} # the parent went down: stop with the same reason
+  end
+  def handle_info({:DOWN, ref, _, _, _}, %{refs: refs} = state) do
+    case List.delete(refs, ref) do
+      [] -> {:stop, :normal, state} # every monitored intermediary stage is down
+      refs -> {:noreply, %{state | refs: refs}}
+    end
+  end
+
+  def terminate(_reason, %{supervisor: supervisor}) do
+    ref = Process.monitor(supervisor)
+    Process.exit(supervisor, :shutdown)
+    receive do
+      {:DOWN, ^ref, _, _, _} -> :ok # block until the supervisor is actually down
+    end
+  end
+end
diff --git a/lib/flow/map_reducer.ex b/lib/flow/map_reducer.ex
new file mode 100644
index 0000000..89bb2e9
--- /dev/null
+++ b/lib/flow/map_reducer.ex
@@ -0,0 +1,101 @@
+defmodule Flow.MapReducer do
+  @moduledoc false
+  use GenStage
+
+  def init({type, opts, index, trigger, acc, reducer}) do
+    {type, {%{}, build_status(type, trigger), index, acc.(), reducer}, opts} # state: {producers, status, index, acc, reducer}
+  end
+
+  def handle_subscribe(:producer, opts, {pid, ref}, {producers, status, index, acc, reducer}) do
+    opts[:tag] && Process.put(ref, opts[:tag]) # keep the subscription tag, if any, under the subscription ref
+    status = producer_status(pid, ref, status)
+    {:automatic, {Map.put(producers, ref, nil), status, index, acc, reducer}}
+  end
+
+  def handle_subscribe(:consumer, _, {pid, ref}, {producers, status, index, acc, reducer}) do
+    status = consumer_status(pid, ref, status)
+    {:automatic, {producers, status, index, acc, reducer}}
+  end
+
+  def handle_cancel(_, {_, ref}, {producers, status, index, acc, reducer}) do
+    status = cancel_status(ref, status)
+    %{consumers: consumers} = status
+
+    cond do
+      Map.has_key?(producers, ref) -> # a producer subscription was cancelled: treat that producer as done
+        Process.delete(ref)
+        {events, acc, status} = done_status(status, index, acc, ref)
+        {:noreply, events,
{Map.delete(producers, ref), status, index, acc, reducer}}
+      consumers == [] -> # the last consumer is gone: stop the stage
+        {:stop, :normal, {producers, status, index, acc, reducer}}
+      true ->
+        {:noreply, [], {producers, status, index, acc, reducer}}
+    end
+  end
+
+  def handle_info({:trigger, keep_or_reset, name}, {producers, status, index, acc, reducer}) do
+    %{trigger: trigger} = status
+    {events, acc} = trigger.(acc, index, keep_or_reset, name) # the trigger fun returns the events to emit and the next accumulator
+    {:noreply, events, {producers, status, index, acc, reducer}}
+  end
+  def handle_info({{_, ref}, {:producer, state}}, {producers, status, index, acc, reducer}) when state in [:halted, :done] do
+    {events, acc, status} = done_status(status, index, acc, ref)
+    {:noreply, events, {producers, status, index, acc, reducer}}
+  end
+  def handle_info(_msg, state) do
+    {:noreply, [], state}
+  end
+
+  def handle_events(events, {_, ref}, {producers, status, index, acc, reducer}) when is_function(reducer, 4) do
+    {events, acc} = reducer.(ref, events, acc, index) # 4-arity reducers do not see the producers map
+    {:noreply, events, {producers, status, index, acc, reducer}}
+  end
+  def handle_events(events, {_, ref}, {producers, status, index, acc, reducer}) do
+    {producers, events, acc} = reducer.(producers, ref, events, acc, index) # other reducers receive and return the producers map
+    {:noreply, events, {producers, status, index, acc, reducer}}
+  end
+
+  ## Helpers
+
+  defp build_status(type, trigger) do
+    consumers = if type == :consumer, do: nil, else: [] # nil marks a :consumer stage, which tracks no consumers
+    %{consumers: consumers, producers: %{}, active: [], done?: false, trigger: trigger}
+  end
+
+  defp producer_status(pid, ref, %{active: active, producers: producers} = status) do
+    %{status | active: [ref | active], producers: Map.put(producers, ref, pid)}
+  end
+
+  defp consumer_status(_pid, ref, %{consumers: consumers} = status) do
+    %{status | consumers: [ref | consumers]}
+  end
+
+  defp cancel_status(ref, %{consumers: consumers, producers: producers} = status) do
+    %{status | consumers: consumers && List.delete(consumers, ref), # stays nil for :consumer stages
+               producers: Map.delete(producers, ref)}
+  end
+
+  defp done_status(%{active: [], done?:
true} = status, _index, acc, _ref) do
+    {[], acc, status} # already done: nothing left to emit
+  end
+  defp done_status(%{active: active, done?: false, trigger: trigger,
+                     consumers: consumers, producers: producers} = status,
+                   index, acc, ref) do
+    case List.delete(active, ref) do
+      [] when active != [] -> # the last active producer has just finished
+        {events, acc} = trigger.(acc, index, :keep, :done)
+
+        if is_list(consumers) do
+          GenStage.async_notify(self(), {:producer, :done}) # stage may have consumers: notify them
+        else
+          for {ref, pid} <- producers do # :consumer stage: cancel the remaining producer subscriptions
+            GenStage.cancel({pid, ref}, :normal, [:noconnect])
+          end
+        end
+
+        {events, acc, %{status | active: [], done?: true}}
+      active ->
+        {[], acc, %{status | active: active}}
+    end
+  end
+end
diff --git a/lib/flow/materialize.ex b/lib/flow/materialize.ex
new file mode 100644
index 0000000..f1525f8
--- /dev/null
+++ b/lib/flow/materialize.ex
@@ -0,0 +1,555 @@
+defmodule Flow.Materialize do
+  @moduledoc false
+
+  @compile :inline_list_funcs
+  @map_reducer_opts [:buffer_keep, :buffer_size, :dispatcher]
+
+  def materialize(%{producers: nil}, _, _, _) do
+    raise ArgumentError, "cannot execute a flow without producers, " <>
+                         "please call \"from_enumerable\" or \"from_stage\" accordingly"
+  end
+
+  def materialize(%{operations: operations, options: options, producers: producers, window: window},
+                  start_link, type, type_options) do
+    options = Keyword.merge(type_options, options) # the flow's own options win over type_options
+    ops = split_operations(operations)
+    {producers, consumers, ops, window} = start_producers(producers, ops, start_link, window, options)
+    {producers, start_stages(ops, window, consumers, start_link, type, options)}
+  end
+
+  ## Helpers
+
+  @doc """
+  Splits the flow operations into layers of stages.
+  """
+  def split_operations([]) do
+    :none
+  end
+  def split_operations(operations) do
+    split_operations(:lists.reverse(operations), :mapper, []) # reverse before walking the list
+  end
+
+  defp split_operations([{:mapper, _, _} = op | ops], :mapper, acc_ops) do
+    split_operations(ops, :mapper, [op | acc_ops])
+  end
+  defp split_operations([op | ops], _type, acc_ops) do
+    split_operations(ops, :reducer, [op | acc_ops]) # the first non-mapper op turns the layer into a reducer
+  end
+  defp split_operations([], :mapper, ops) do
+    {:mapper, mapper_ops(ops), :lists.reverse(ops)}
+  end
+  defp split_operations([], :reducer, ops) do
+    ops = :lists.reverse(ops)
+    {:reducer, reducer_ops(ops), ops}
+  end
+
+  defp start_stages(:none, window, producers, _start_link, _type, _options) do
+    if window != Flow.Window.global do
+      raise ArgumentError, "a window was set but no computation is happening on this partition"
+    end
+    producers # no operations: the producers themselves are the last layer
+  end
+  defp start_stages({_mr, compiled_ops, _ops}, window, producers, start_link, type, opts) do
+    {acc, reducer, trigger} = window_ops(window, compiled_ops, opts)
+    {stages, opts} = Keyword.pop(opts, :stages)
+    {init_opts, subscribe_opts} = Keyword.split(opts, @map_reducer_opts) # stage init options vs subscription options
+    init_opts =
+      case type do
+        :consumer -> Keyword.drop(init_opts, [:dispatcher]) # :dispatcher is not passed to :consumer stages
+        _ -> init_opts
+      end
+
+    for i <- 0..stages-1 do # one Flow.MapReducer per stage, subscribed to every producer on partition i
+      subscriptions =
+        for {producer, producer_opts} <- producers do
+          {producer, [partition: i] ++ Keyword.merge(subscribe_opts, producer_opts)}
+        end
+      arg = {type, [subscribe_to: subscriptions] ++ init_opts, {i, stages}, trigger, acc, reducer}
+      {:ok, pid} = start_link.(Flow.MapReducer, arg, [])
+      {pid, []}
+    end
+  end
+
+  ## Producers
+
+  defp start_producers({:join, kind, left, right, left_key, right_key, join},
+                       ops, start_link, window, options) do
+    partitions = Keyword.fetch!(options, :stages)
+    {left_producers, left_consumers} = start_join(:left, left, left_key, partitions, start_link)
+    {right_producers, right_consumers} = start_join(:right, right, right_key, partitions, start_link)
+    {type, {acc, fun, trigger}, ops} =
ensure_ops(ops)
+
+    window =
+      case window do
+        %{by: by} -> %{window | by: fn x -> by.(elem(x, 1)) end} # apply `by` to the second element of each event tuple
+        %{} -> window
+      end
+
+    {left_producers ++ right_producers,
+     left_consumers ++ right_consumers,
+     {type, join_ops(kind, join, acc, fun, trigger), ops},
+     window}
+  end
+  defp start_producers({:departition, flow, acc_fun, merge_fun, done_fun},
+                       ops, start_link, window, options) do
+    flow = flow.window.__struct__.departition(flow) # delegates to the departition/1 of the window's module
+    {producers, consumers} = materialize(flow, start_link, :producer_consumer, options)
+    {type, {acc, fun, trigger}, ops} = ensure_ops(ops)
+
+    stages = Keyword.fetch!(flow.options, :stages)
+    partitions = Enum.to_list(0..stages-1) # every upstream partition index, used to track which ones are done
+
+    {producers, consumers,
+     {type, departition_ops(acc, fun, trigger, partitions, acc_fun, merge_fun, done_fun), ops},
+     window}
+  end
+  defp start_producers({:flows, flows}, ops, start_link, window, options) do
+    options = partition(options) # upstream flows dispatch through the partition dispatcher built here
+    {producers, consumers} =
+      Enum.reduce(flows, {[], []}, fn flow, {producers_acc, consumers_acc} ->
+        {producers, consumers} = materialize(flow, start_link, :producer_consumer, options)
+        {producers ++ producers_acc, consumers ++ consumers_acc}
+      end)
+    {producers, consumers, ensure_ops(ops), window}
+  end
+  defp start_producers({:stages, producers}, ops, _start_link, window, options) do
+    producers = for producer <- producers, do: {producer, []}
+
+    # If there are no more stages and there is a need for a custom
+    # dispatcher, we need to wrap the sources in a custom stage.
+    if Keyword.has_key?(options, :dispatcher) do
+      {producers, producers, ensure_ops(ops), window}
+    else
+      {producers, producers, ops, window}
+    end
+  end
+  defp start_producers({:enumerables, enumerables}, ops, start_link, window, options) do
+    # options configures all stages before partition, so it effectively
+    # controls the number of stages consuming the enumerables.
+    stages = Keyword.fetch!(options, :stages)
+
+    case ops do
+      {:mapper, _compiled_ops, mapper_ops} when stages < length(enumerables) ->
+        # Fuse mappers into enumerables if we have more enumerables than stages.
+        producers = start_enumerables(enumerables, mapper_ops, partition(options), start_link)
+        {producers, producers, :none, window}
+      :none ->
+        # If there are no ops, just start the enumerables with the options.
+        producers = start_enumerables(enumerables, [], options, start_link)
+        {producers, producers, :none, window}
+      _ ->
+        # Otherwise it is a regular producer consumer with demand dispatcher.
+        # In this case, options is used by subsequent mapper/reducer stages.
+        producers = start_enumerables(enumerables, [], [], start_link)
+        {producers, producers, ops, window}
+    end
+  end
+
+  # Starts one GenStage.Streamer per enumerable. Any mapper `ops` given are
+  # fused into the enumerable by applying the Stream function of the same name.
+  defp start_enumerables(enumerables, ops, opts, start_link) do
+    opts = [consumers: :permanent, demand: :accumulate] ++ Keyword.take(opts, @map_reducer_opts)
+
+    for enumerable <- enumerables do
+      stream =
+        :lists.foldl(fn {:mapper, fun, args}, acc ->
+          apply(Stream, fun, [acc | args])
+        end, enumerable, ops)
+      {:ok, pid} = start_link.(GenStage.Streamer, {stream, opts}, opts)
+      {pid, []}
+    end
+  end
+
+  # Builds the partition dispatcher options. An explicit :hash function takes
+  # precedence; otherwise the hash is derived from :key.
+  defp partition(options) do
+    stages = Keyword.fetch!(options, :stages)
+    hash = options[:hash] || hash_by_key(options[:key], stages)
+    dispatcher_opts = [partitions: 0..stages-1, hash: hash(hash)]
+    [dispatcher: {GenStage.PartitionDispatcher, dispatcher_opts}]
+  end
+
+  defp hash(fun) when is_function(fun, 1) do
+    fun
+  end
+  defp hash(other) do
+    raise ArgumentError, "expected :hash to be a function that receives an event and " <>
+                         "returns a tuple with the event and its partition, got: #{inspect other}"
+  end
+
+  defp hash_by_key(nil, stages) do
+    &{&1, :erlang.phash2(&1, stages)}
+  end
+  # is_integer/1 is required: under term ordering any atom or float also
+  # satisfies `pos >= 0`, which would bypass the descriptive ArgumentError
+  # raised by the catch-all clause.
+  defp hash_by_key({:elem, pos}, stages) when is_integer(pos) and pos >= 0 do
+    # :erlang.element/2 is 1-based while {:elem, pos} is 0-based.
+    pos = pos + 1
+    &{&1, :erlang.phash2(:erlang.element(pos, &1), stages)}
+  end
+  defp hash_by_key({:key, key}, stages) do
+    &{&1,
:erlang.phash2(Map.fetch!(&1, key), stages)} + end + defp hash_by_key(fun, stages) when is_function(fun, 1) do + &{&1, :erlang.phash2(fun.(&1), stages)} + end + defp hash_by_key(other, _) do + raise ArgumentError, """ + expected :key to be one of: + + * a function expecting an event and returning a key + * {:elem, pos} when pos >= 0 + * {:key, key} + + instead got: #{inspect other} + """ + end + + defp ensure_ops(:none), + do: {:mapper, mapper_ops([]), []} + defp ensure_ops(ops), + do: ops + + ## Departition + + defp departition_ops(reducer_acc, reducer_fun, reducer_trigger, partitions, acc_fun, merge_fun, done_fun) do + acc = fn -> {reducer_acc.(), %{}} end + + events = fn ref, events, {acc, windows}, index -> + {events, windows} = dispatch_departition(events, windows, partitions, acc_fun, merge_fun, done_fun) + {events, acc} = reducer_fun.(ref, :lists.reverse(events), acc, index) + {events, {acc, windows}} + end + + trigger = fn + {acc, windows}, index, op, {_, _, :done} = name -> + done = + for {window, {_partitions, acc}} <- :lists.sort(:maps.to_list(windows)) do + done_fun.(acc, window) + end + {events, _} = reducer_trigger.(acc, index, op, name) + {done ++ events, {reducer_acc.(), %{}}} + + {acc, windows}, index, op, name -> + {events, acc} = reducer_trigger.(acc, index, op, name) + {events, {acc, windows}} + end + + {acc, events, trigger} + end + + defp dispatch_departition(events, windows, partitions, acc_fun, merge_fun, done_fun) do + :lists.foldl(fn {state, partition, {_, window, name}}, {events, windows} -> + {partitions, acc} = get_window_data(windows, window, partitions, acc_fun) + partitions = remove_partition_on_done(name, partitions, partition) + acc = merge_fun.(state, acc) + case partitions do + [] -> + {[done_fun.(acc, window) | events], Map.delete(windows, window)} + _ -> + {events, Map.put(windows, window, {partitions, acc})} + end + end, {[], windows}, events) + end + + defp remove_partition_on_done(:done, partitions, partition) do + 
List.delete(partitions, partition) + end + defp remove_partition_on_done(_, partitions, _) do + partitions + end + + defp get_window_data(windows, window, partitions, acc_fun) do + case windows do + %{^window => value} -> value + %{} -> {partitions, acc_fun.()} + end + end + + ## Joins + + defp start_join(side, flow, key_fun, stages, start_link) do + hash = fn event -> + key = key_fun.(event) + {{key, event}, :erlang.phash2(key, stages)} + end + + opts = [dispatcher: {GenStage.PartitionDispatcher, partitions: 0..stages-1, hash: hash}] + {producers, consumers} = materialize(flow, start_link, :producer_consumer, opts) + + {producers, + for {consumer, consumer_opts} <- consumers do + {consumer, [tag: side] ++ consumer_opts} + end} + end + + defp join_ops(kind, join, acc, fun, trigger) do + acc = fn -> {%{}, %{}, acc.()} end + + events = fn ref, events, {left, right, acc}, index -> + {events, left, right} = dispatch_join(events, Process.get(ref), left, right, join, []) + {events, acc} = fun.(ref, events, acc, index) + {events, {left, right, acc}} + end + + ref = make_ref() + + trigger = fn + {left, right, acc}, index, op, {_, _, :done} = name -> + {kind_events, acc} = + case kind do + :inner -> + {[], acc} + :left_outer -> + fun.(ref, left_events(Map.keys(left), Map.keys(right), left, join), acc, index) + :right_outer -> + fun.(ref, right_events(Map.keys(right), Map.keys(left), right, join), acc, index) + :full_outer -> + left_keys = Map.keys(left) + right_keys = Map.keys(right) + {left_events, acc} = fun.(ref, left_events(left_keys, right_keys, left, join), acc, index) + {right_events, acc} = fun.(ref, right_events(right_keys, left_keys, right, join), acc, index) + {left_events ++ right_events, acc} + end + {trigger_events, acc} = trigger.(acc, index, op, name) + {kind_events ++ trigger_events, {left, right, acc}} + {left, right, acc}, index, op, name -> + {events, acc} = trigger.(acc, index, op, name) + {events, {left, right, acc}} + end + + {acc, events, trigger} + 
end + + defp left_events(left, right, source, join) do + for key <- left -- right, entry <- Map.fetch!(source, key), do: join.(entry, nil) + end + + defp right_events(right, left, source, join) do + for key <- right -- left, entry <- Map.fetch!(source, key), do: join.(nil, entry) + end + + defp dispatch_join([{key, left} | rest], :left, left_acc, right_acc, join, acc) do + acc = + case right_acc do + %{^key => rights} -> + :lists.foldl(fn right, acc -> [join.(left, right) | acc] end, acc, rights) + %{} -> acc + end + left_acc = Map.update(left_acc, key, [left], &[left | &1]) + dispatch_join(rest, :left, left_acc, right_acc, join, acc) + end + defp dispatch_join([{key, right} | rest], :right, left_acc, right_acc, join, acc) do + acc = + case left_acc do + %{^key => lefties} -> + :lists.foldl(fn left, acc -> [join.(left, right) | acc] end, acc, lefties) + %{} -> acc + end + right_acc = Map.update(right_acc, key, [right], &[right | &1]) + dispatch_join(rest, :right, left_acc, right_acc, join, acc) + end + defp dispatch_join([], _, left_acc, right_acc, _join, acc) do + {:lists.reverse(acc), left_acc, right_acc} + end + + ## Windows + + defp window_ops(%{trigger: trigger, periodically: periodically} = window, + {reducer_acc, reducer_fun, reducer_trigger}, options) do + {window_acc, window_fun, window_trigger} = + window_trigger(trigger, reducer_acc, reducer_fun, reducer_trigger) + {type_acc, type_fun, type_trigger} = + window.__struct__.materialize(window, window_acc, window_fun, window_trigger, options) + {window_periodically(type_acc, periodically), type_fun, type_trigger} + end + + defp window_trigger(nil, reducer_acc, reducer_fun, reducer_trigger) do + {reducer_acc, reducer_fun, reducer_trigger} + end + defp window_trigger({punctuation_acc, punctuation_fun}, + reducer_acc, reducer_fun, reducer_trigger) do + {fn -> {punctuation_acc.(), reducer_acc.()} end, + build_punctuated_reducer(punctuation_fun, reducer_fun, reducer_trigger), + 
build_punctuated_trigger(reducer_trigger)} + end + + defp build_punctuated_reducer(punctuation_fun, red_fun, trigger) do + fn ref, events, {pun_acc, red_acc}, index, name -> + maybe_punctuate(ref, events, punctuation_fun, pun_acc, red_acc, red_fun, index, name, trigger, []) + end + end + + defp build_punctuated_trigger(trigger) do + fn {trigger_acc, red_acc}, index, op, name -> + {events, red_acc} = trigger.(red_acc, index, op, name) + {events, {trigger_acc, red_acc}} + end + end + + defp maybe_punctuate(ref, events, punctuation_fun, pun_acc, red_acc, + red_fun, index, name, trigger, collected) do + case punctuation_fun.(events, pun_acc) do + {:trigger, trigger_name, pre, op, pos, pun_acc} -> + {red_events, red_acc} = red_fun.(ref, pre, red_acc, index) + {trigger_events, red_acc} = trigger.(red_acc, index, op, put_elem(name, 2, trigger_name)) + maybe_punctuate(ref, pos, punctuation_fun, pun_acc, red_acc, + red_fun, index, name, trigger, collected ++ trigger_events ++ red_events) + {:cont, pun_acc} -> + {red_events, red_acc} = red_fun.(ref, events, red_acc, index) + {collected ++ red_events, {pun_acc, red_acc}} + end + end + + defp window_periodically(window_acc, []) do + window_acc + end + defp window_periodically(window_acc, periodically) do + fn -> + for {time, keep_or_reset, name} <- periodically do + {:ok, _} = :timer.send_interval(time, self(), {:trigger, keep_or_reset, name}) + end + window_acc.() + end + end + + ## Reducers + + defp reducer_ops(ops) do + case take_mappers(ops, []) do + {mappers, [{:reduce, reducer_acc, reducer_fun} | ops]} -> + {reducer_acc, build_reducer(mappers, reducer_fun), build_trigger(ops, reducer_acc)} + {mappers, [{:uniq, uniq_by} | ops]} -> + {acc, reducer, trigger} = reducer_ops(ops) + {fn -> {%{}, acc.()} end, + build_uniq_reducer(mappers, reducer, uniq_by), + build_uniq_trigger(trigger)} + {mappers, ops} -> + {fn -> [] end, build_reducer(mappers, &[&1 | &2]), build_trigger(ops, fn -> [] end)} + end + end + + defp 
build_reducer(mappers, fun) do + reducer = :lists.foldl(&mapper/2, fun, mappers) + fn _ref, events, acc, _index -> + {[], :lists.foldl(reducer, acc, events)} + end + end + + @protocol_undefined "if you would like to emit a modified state from flow, like " <> + "a counter or a custom data-structure, please call Flow.emit/2 accordingly" + + defp build_trigger(ops, acc_fun) do + map_states = merge_map_state(ops) + + fn acc, index, op, name -> + events = :lists.foldl(& &1.(&2, index, name), acc, map_states) + + try do + Enum.to_list(events) + rescue + e in Protocol.UndefinedError -> + msg = @protocol_undefined + + e = update_in e.description, fn + "" -> msg + dc -> dc <> " (#{msg})" + end + + reraise e, System.stacktrace + else + events -> + case op do + :keep -> {events, acc} + :reset -> {events, acc_fun.()} + end + end + end + end + + defp build_uniq_reducer(mappers, reducer, uniq_by) do + uniq_by = :lists.foldl(&mapper/2, uniq_by_reducer(uniq_by), mappers) + fn ref, events, {set, acc}, index -> + {set, events} = :lists.foldl(uniq_by, {set, []}, events) + {events, acc} = reducer.(ref, :lists.reverse(events), acc, index) + {events, {set, acc}} + end + end + + defp uniq_by_reducer(uniq_by) do + fn event, {set, acc} -> + key = uniq_by.(event) + case set do + %{^key => true} -> {set, acc} + %{} -> {Map.put(set, key, true), [event | acc]} + end + end + end + + defp build_uniq_trigger(trigger) do + fn {set, acc}, index, op, name -> + {events, acc} = trigger.(acc, index, op, name) + {events, {set, acc}} + end + end + + defp merge_map_state(ops) do + case take_mappers(ops, []) do + {[], [{:map_state, fun} | ops]} -> + [fun | merge_map_state(ops)] + {[], [{:uniq, by} | ops]} -> + [fn acc, _, _ -> Enum.uniq_by(acc, by) end | merge_map_state(ops)] + {[], []} -> + [] + {mappers, ops} -> + reducer = :lists.foldl(&mapper/2, &[&1 | &2], mappers) + [fn old_acc, _, _ -> Enum.reduce(old_acc, [], reducer) end | merge_map_state(ops)] + end + end + + ## Mappers + + defp mapper_ops(ops) 
do + reducer = :lists.foldl(&mapper/2, &[&1 | &2], ops) + {fn -> [] end, + fn _ref, events, [], _index -> {:lists.reverse(:lists.foldl(reducer, [], events)), []} end, + fn _acc, _index, _op, _trigger -> {[], []} end} + end + + defp mapper({:mapper, :each, [each]}, fun) do + fn x, acc -> each.(x); fun.(x, acc) end + end + defp mapper({:mapper, :filter, [filter]}, fun) do + fn x, acc -> + if filter.(x) do + fun.(x, acc) + else + acc + end + end + end + defp mapper({:mapper, :filter_map, [filter, mapper]}, fun) do + fn x, acc -> + if filter.(x) do + fun.(mapper.(x), acc) + else + acc + end + end + end + defp mapper({:mapper, :flat_map, [flat_mapper]}, fun) do + fn x, acc -> + Enum.reduce(flat_mapper.(x), acc, fun) + end + end + defp mapper({:mapper, :map, [mapper]}, fun) do + fn x, acc -> fun.(mapper.(x), acc) end + end + defp mapper({:mapper, :reject, [filter]}, fun) do + fn x, acc -> + if filter.(x) do + acc + else + fun.(x, acc) + end + end + end + + defp take_mappers([{:mapper, _, _} = mapper | ops], acc), + do: take_mappers(ops, [mapper | acc]) + defp take_mappers(ops, acc), + do: {acc, ops} +end diff --git a/lib/flow/window.ex b/lib/flow/window.ex new file mode 100644 index 0000000..ce3eecf --- /dev/null +++ b/lib/flow/window.ex @@ -0,0 +1,573 @@ +defmodule Flow.Window do + @moduledoc """ + Splits a flow into windows that are materialized at certain triggers. + + Windows allow developers to split data so we can understand incoming + data as time progresses. Once a window is created, we can specify + triggers that allow us to customize when the data accumulated on every + window is materialized. + + Windows must be created by calling one of the window type functions. + There are currently two window types: + + * Global windows - that's the default window which means all data + belongs to one single window. In other words, the data is not + split in any way. 
The window finishes when all producers notify + there is no more data. + + * Fixed windows - splits incoming events into periodic, non- + overlapping windows based on event times. In other words, a given + event belongs to a single window. If data arrives late, a configured + lateness can be specified. + + * Periodic windows - splits incoming events into periodic, non- + overlapping windows based on processing times. Similar to fixed + windows, a given event belongs to a single window. + + * Count windows - splits incoming events based on a count. + Similar to fixed windows, a given event belongs to a single + window. + + * Session windows - splits incoming events into unique windows + which are grouped until there is a configured gap between event + times. Sessions are useful for data that is irregularly + distributed with respect to time. + + We discuss all types and include examples below. In the first section, + "Global windows", we build the basic intuition about windows and triggers + as well as discuss the distinction between "Event time and processing time". + Then we explore "Fixed windows" and the concept of lateness before moving + on to other window types. + + ## Global windows + + By default, all events belong to the global window. The global window + is automatically attached to a partition if no window is specified. + The flow below: + + Flow.from_stage(some_producer) + |> Flow.partition() + |> Flow.reduce(fn -> 0 end, & &1 + &2) + + is equivalent to: + + Flow.from_stage(some_producer) + |> Flow.partition(window: Flow.Window.global()) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + + Even though the global window does not split the data in any way, it + already provides conveniences for working with both bounded (finite) + and unbounded (infinite) data via triggers.
+ + For example, the flow below uses a global window with a count-based + trigger to emit the values being summed as we sum them: + + iex> window = Flow.Window.global |> Flow.Window.trigger_every(10) + iex> flow = Flow.from_enumerable(1..100) |> Flow.partition(window: window, stages: 1) + iex> flow |> Flow.reduce(fn -> 0 end, & &1 + &2) |> Flow.emit(:state) |> Enum.to_list() + [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + + Let's explore the types of triggers available next. + + ### Triggers + + Triggers allow us to checkpoint the data processed so far. There + are different triggers we can use: + + * Event count triggers - compute state operations every X events + + * Processing time triggers - compute state operations every X time + units for every stage + + * Punctuation - hand-written triggers based on the data + + Flow supports the triggers above via the `trigger_every/3`, + `trigger_periodically/4` and `trigger/3` respectively. + + Once a trigger is emitted, the `reduce/3` step halts and invokes + the remaining steps for that flow such as `map_state/2` or any other + call after `reduce/3`. Triggers are also named and the trigger names + will be sent alongside the window name as the third argument to the callback + given to `map_state/2` and `each_state/2`. + + For every emitted trigger, developers have the choice of either + resetting the reducer accumulator (`:reset`) or keeping it as is (`:keep`). + The resetting option is useful when you are interested only in intermediate + results, usually because another step is aggregating the data. Keeping the + accumulator is the default and used to checkpoint the values while still + working towards an end result. + + ### Event time and processing time + + Before we move to other window types, it is important to discuss + the distinction between event time and processing time.
In particular, + triggers created with the `trigger_periodically/4` function are + intrinsically inaccurate and therefore should not be used to split the + data. For example, if you are measuring the frequency that events arrive, + using the event time will always yield the same result, while processing + time will be vulnerable to fluctuations if, for instance, an external + factor causes events to be processed slower or faster than usual. + + Furthermore, periodic triggers are established per partition and are + message-based, which means partitions will emit the triggers at different + times and possibly with delays based on the partition message queue size. + However, it is exactly this lack of precision which makes them efficient + for checkpointing data. + + Flow provides other window types, such as fixed windows, exactly to address + the issues with processing time. Such windows use the event time which is + based on the data itself. When working with event time, we can assign the + data into proper windows even when late or out of order. Such windows can + be used to gather time-based insight from the data (for example, the most + popular hashtags in the last 10 minutes) as well as for checkpointing data. + + ## Fixed windows (event time) + + Fixed windows group the data based on the event times. Regardless if + the data is bounded or not, fixed windows give us time-based insight + about the data. + + Fixed windows are created via the `fixed/3` function which specifies + the duration of the window and a function that retrieves the event time + from each event: + + Flow.Window.fixed(1, :hour, fn {word, timestamp} -> timestamp end) + + Let's see an example that will use the window above to count the frequency + of words based on windows that are 1 hour long. The timestamps used by + Flow are integers in milliseconds.
For now, we will also set the concurrency + down to 1 and max demand down to 5 as it is simpler to reason about the results: + + iex> data = [{"elixir", 0}, {"elixir", 1_000}, {"erlang", 60_000}, + ...> {"concurrency", 3_200_000}, {"elixir", 4_000_000}, + ...> {"erlang", 5_000_000}, {"erlang", 6_000_000}] + iex> window = Flow.Window.fixed(1, :hour, fn {_word, timestamp} -> timestamp end) + iex> flow = Flow.from_enumerable(data, max_demand: 5, stages: 1) + iex> flow = Flow.partition(flow, window: window, stages: 1) + iex> flow = Flow.reduce(flow, fn -> %{} end, fn {word, _}, acc -> + ...> Map.update(acc, word, 1, & &1 + 1) + ...> end) + iex> flow |> Flow.emit(:state) |> Enum.to_list + [%{"elixir" => 2, "erlang" => 1, "concurrency" => 1}, + %{"elixir" => 1, "erlang" => 2}] + + Since the data has been broken in two windows, the first four events belong + to the same window while the last three belong to the second one. Notice that + `reduce/3` is executed per window and that each event belongs to a single + window exclusively. + + Similar to global windows, fixed windows can also have triggers, allowing + us to checkpoint the data as the computation happens. + + ### Data ordering, watermarks and lateness + + When working with event time, Flow assumes by default that events are time + ordered. This means that, when we move from one window to another, like + when we received the entry `{"elixir", 4_000_000}` in the example above, + we assume the previous window has been completed.
+ + Let's change the events above to be out of order and move the first event + to the end of the dataset and see what happens: + + iex> data = [{"elixir", 1_000}, {"erlang", 60_000}, + ...> {"concurrency", 3_200_000}, {"elixir", 4_000_000}, + ...> {"erlang", 5_000_000}, {"erlang", 6_000_000}, {"elixir", 0}] + iex> window = Flow.Window.fixed(1, :hour, fn {_word, timestamp} -> timestamp end) + iex> flow = Flow.from_enumerable(data) |> Flow.partition(window: window, stages: 1, max_demand: 5) + iex> flow = Flow.reduce(flow, fn -> %{} end, fn {word, _}, acc -> + ...> Map.update(acc, word, 1, & &1 + 1) + ...> end) + iex> flow |> Flow.emit(:state) |> Enum.to_list + [%{"elixir" => 1, "erlang" => 1, "concurrency" => 1}, + %{"elixir" => 1, "erlang" => 2}] + + Notice that now the first map did not count the "elixir" word twice. + Since the event arrived late, it was marked as lost. However, in many + flows we actually expect data to arrive late or out of order, especially + when talking about concurrent data processing. + + Luckily, event time windows include the concept of lateness, which is a + processing time based period we would wait to receive late events.
+ Let's change the example above once more but now change the window + to also call `allowed_lateness/4`: + + iex> data = [{"elixir", 1_000}, {"erlang", 60_000}, + ...> {"concurrency", 3_200_000}, {"elixir", 4_000_000}, + ...> {"erlang", 5_000_000}, {"erlang", 6_000_000}, {"elixir", 0}] + iex> window = Flow.Window.fixed(1, :hour, fn {_word, timestamp} -> timestamp end) + iex> window = Flow.Window.allowed_lateness(window, 5, :minute) + iex> flow = Flow.from_enumerable(data) |> Flow.partition(window: window, stages: 1, max_demand: 5) + iex> flow = Flow.reduce(flow, fn -> %{} end, fn {word, _}, acc -> + ...> Map.update(acc, word, 1, & &1 + 1) + ...> end) + iex> flow |> Flow.emit(:state) |> Enum.to_list + [%{"concurrency" => 1, "elixir" => 1, "erlang" => 1}, + %{"concurrency" => 1, "elixir" => 2, "erlang" => 1}, + %{"elixir" => 1, "erlang" => 2}] + + Now that we allow late events, we can see the first window emitted + twice. Instead of the window being marked as done when 1 hour passes, + we say it emits a **watermark trigger**. The window will be effectively + done only after the allowed lateness period. If desired, we can use + `Flow.map_state/2` to get more information about each particular window + and its trigger. Replace the last line above by the following: + + flow = flow |> Flow.map_state(fn state, _index, trigger -> {state, trigger} end) + flow = flow |> Flow.emit(:state) |> Enum.to_list() + + The trigger parameter will include the type of window, the current + window and what caused the window to be emitted (`:watermark` or + `:done`). + + ## Periodic windows (processing time) + + Periodic windows are similar to fixed windows except triggers are + emitted based on processing time instead of event time. Remember that + relying on periodic windows or triggers is intrinsically inaccurate and + should not be used to split the data, only as a checkpointing device. 
+ + Periodic windows are also similar to global windows that use + `trigger_periodically/4` to emit events periodically. The difference is + that periodic windows emit a window in a given interval while a trigger + emits a trigger. This behaviour may affect functions such as `Flow.departition/4`, + which calls the `merge` callback per trigger but the `done` callback per + window. Unless you are relying on functions such as `Flow.departition/4`, + there is no distinction between periodic windows and global windows with + periodic triggers. + + ## Count windows (event count) + + Count windows are simpler versions of fixed windows where windows are split + apart by event count. Since it is not time-based, it does not provide the + concept of lateness. + + iex> window = Flow.Window.count(10) + iex> flow = Flow.from_enumerable(1..100) |> Flow.partition(window: window, stages: 1) + iex> flow |> Flow.reduce(fn -> 0 end, & &1 + &2) |> Flow.emit(:state) |> Enum.to_list() + [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + + Count windows are also similar to global windows that use `trigger_every/2` + to emit events per count. The difference is that count windows emit a + window per event count while a trigger belongs to a window. This behaviour + may affect functions such as `Flow.departition/4`, which calls the `merge` + callback per trigger but the `done` callback per window. Unless you are + relying on functions such as `Flow.departition/4`, there is no distinction + between count windows and global windows with count triggers. + + ## Session windows (event time) + + Session windows are useful for data that is irregularly distributed with + respect to time. For example, GPS data contains moments of user activity + with long periods of user inactivity. Sessions allow us to group these + events together until there is a time gap between them. + + Session windows by definition belong to a single key.
Therefore, the :key + option must be given to the partition alongside the window option. For + instance, in case of GPS data, the key would be the `device_id` or the + `user_id`. + + To build on this example, imagine we want to calculate the distance + travelled by a user on certain trips based on GPS data. Let's assume the + movement happens on a one-dimensional line for simplicity. Our server + will receive streaming data from different users in the shape of: + + {user_id, position, time_in_seconds} + + Our code is going to calculate the location per user per trip based on + time inactivity: + + iex> data = [{1, 32, 0}, {1, 35, 60}, {1, 40, 120}, # user 1 - trip 1 + ...> {2, 45, 60}, {2, 43, 70}, {2, 47, 200}, # user 2 - trip 1 + ...> {1, 40, 3600}, {1, 43, 3700}, {1, 50, 4000}] # user 1 - trip 2 + iex> key = fn {user_id, _position, _time} -> user_id end # Partition per user + iex> window = Flow.Window.session(20, :minute, fn {_user_id, _position, time} -> time * 1000 end) + iex> flow = Flow.from_enumerable(data) |> Flow.partition(key: key, window: window) + iex> flow = Flow.reduce(flow, fn -> :empty end, fn + ...> {_, pos, _}, :empty -> {pos, 0} # initial point and distance + ...> {_, pos, _}, {last, distance} -> {pos, abs(pos - last) + distance} + ...> end) + iex> flow = Flow.map_state(flow, fn {_, distance}, _partition, {:session, {user_id, start, last}, :done} -> + ...> {user_id, distance, div(last - start, 1000)} # user_id travelled total in last - start seconds + ...> end) + iex> flow |> Flow.emit(:state) |> Enum.sort() + [{1, 8, 120}, {1, 10, 400}, {2, 6, 140}] + """ + + @type t :: %{required(:trigger) => {fun(), fun()} | nil, + required(:periodically) => []} + + @typedoc "The supported window types." + @type type :: :global | :fixed | :session | :periodic | :count | any() + + @typedoc """ + A function that retrieves the field to window by. + + It must be an integer representing the time in milliseconds. 
+ Flow does not care if the integer is using the UNIX epoch, + Gregorian epoch or any other as long as it is consistent. + """ + @type by :: (term -> non_neg_integer) + + @typedoc """ + The window identifier. + + It is `:global` for `:global` windows. An integer for fixed + windows and a custom value for session windows. + """ + @type id :: :global | non_neg_integer() | term() + + @typedoc "The name of the trigger." + @type trigger :: term + + @typedoc "The operation to perform on the accumulator." + @type accumulator :: :keep | :reset + + @trigger_operation [:keep, :reset] + + @doc """ + Returns a global window. + + Global window triggers have the shape of `{:global, :global, trigger_name}`. + + See the section on "Global windows" in the module documentation for examples. + """ + @spec global :: t + def global do + %Flow.Window.Global{} + end + + @doc """ + Returns a count-based window of every `count` elements. + + `count` must be a positive integer. + + Count window triggers have the shape of `{:count, window, trigger_name}`, + where `window` is an incrementing integer identifying the window. + + See the section on "Count windows" in the module documentation for examples. + """ + @spec count(pos_integer) :: t + def count(count) when is_integer(count) and count > 0 do + %Flow.Window.Count{count: count} + end + + @doc """ + Returns a periodic-based window on every `count` `unit`. + + `count` is a positive integer and `unit` is one of `:millisecond`, + `:second`, `:minute`, `:hour`. Remember periodic triggers are established + per partition and are message-based, which means partitions will emit the + triggers at different times and possibly with delays based on the partition + message queue size. + + Periodic window triggers have the shape of `{:periodic, window, trigger_name}`, + where `window` is an incrementing integer identifying the window. + + See the section on "Periodic windows" in the module documentation for examples. 
+ """ + @spec periodic(pos_integer, System.time_unit) :: t + def periodic(count, unit) when is_integer(count) and count > 0 do + %Flow.Window.Periodic{duration: to_ms(count, unit)} + end + + @doc """ + Returns a fixed window of duration `count` `unit` where the + event time is calculated by the given function `by`. + + `count` is a positive integer and `unit` is one of `:millisecond`, + `:second`, `:minute`, `:hour`. + + Fixed window triggers have the shape of `{:fixed, window, trigger_name}`, + where `window` is an integer that represents the beginning timestamp + for the current window. + + If `allowed_lateness/4` is used with fixed windows, the window will + first emit a `{:fixed, window, :watermark}` trigger when the window + terminates and emit `{:fixed, window, :done}` only after the + `allowed_lateness/4` duration has passed. + + See the section on "Fixed windows" in the module documentation for examples. + """ + @spec fixed(pos_integer, System.time_unit, (t -> pos_integer)) :: t + def fixed(count, unit, by) when is_integer(count) and count > 0 and is_function(by, 1) do + %Flow.Window.Fixed{duration: to_ms(count, unit), by: by} + end + + @doc """ + Returns a session window that works on gaps given by `count` `unit` and + the event time is calculated by the given function `by`. + + `count` is a positive integer and `unit` is one of `:millisecond`, + `:second`, `:minute`, `:hour`. + + Session window triggers have the shape of + `{:session, {key, first_time, last_time}, trigger_name}`, where `key` + is the window key, the `first_time` in the session and the `last_time` + on the session thus far. + + See the section on "Session windows" in the module documentation for examples. 
+ """ + @spec session(pos_integer, System.time_unit, (t -> pos_integer)) :: t + def session(count, unit, by) when is_integer(count) and count > 0 and is_function(by, 1) do + %Flow.Window.Session{gap: to_ms(count, unit), by: by} + end + + @doc """ + Sets a duration, in processing time, of how long we will + wait for late events for a given window. + + If allowed lateness is configured, once the window is finished, + it won't trigger a `:done` event but instead emit a `:watermark`. + The `keep_or_reset` option can configure if the state should be + kept or reset when the watermark is triggered. The window will + be done only when the allowed lateness time expires, effectively + emitting the `:done` trigger. + + `count` is a positive number. The `unit` may be a time unit + (`:second`, `:millisecond`, `:second`, `:minute` and `:hour`). + """ + @spec allowed_lateness(t, pos_integer, System.time_unit, :keep | :reset) :: t + def allowed_lateness(window, count, unit, keep_or_reset \\ :keep) + + def allowed_lateness(%{lateness: _} = window, count, unit, keep_or_reset) do + %{window | lateness: {to_ms(count, unit), keep_or_reset}} + end + def allowed_lateness(window, _, _, _) do + raise ArgumentError, "allowed_lateness/4 not supported for window type #{inspect window}" + end + + @doc """ + Calculates when to emit a trigger. + + Triggers are calculated per window and are used to temporarily + halt the window accumulation, typically done with `Flow.reduce/3`, + allowing the next operations to execute before accumulation is + resumed. + + This function expects the trigger accumulator function, which will + be invoked at the beginning of every window, and a trigger function + that receives the current batch of events and its own accumulator. + The trigger function must return one of the two values: + + * `{:cont, acc}` - the reduce operation should continue as usual. + `acc` is the trigger state. 
+ + * `{:trigger, name, pre, operation, pos, acc}` - where `name` is the + trigger `name`, `pre` are the events to be consumed before the trigger, + the `operation` configures the stage should `:keep` the reduce accumulator + or `:reset` it. `pos` controls events to be processed after the trigger + with the `acc` as the new trigger accumulator. + + We recommend looking at the implementation of `trigger_every/3` as + an example of a custom trigger. + """ + @spec trigger(t, (() -> acc), ([event], acc -> cont_tuple | trigger_tuple)) :: t + when cont_tuple: {:cont, acc}, + trigger_tuple: {:trigger, trigger(), pre, accumulator(), pos, acc}, + pre: [event], pos: [event], acc: term(), event: term() + def trigger(window, acc_fun, trigger_fun) do + if is_function(acc_fun, 0) do + add_trigger(window, {acc_fun, trigger_fun}) + else + raise ArgumentError, "Flow.Window.trigger/3 expects the accumulator to be given as a function" + end + end + + @doc """ + A trigger emitted every `count` elements in a window. + + The `keep_or_reset` argument must be one of `:keep` or `:reset`. + If `:keep`, the state accumulated so far on `reduce/3` will be kept, + otherwise discarded. + + The trigger will be named `{:every, count}`. + + ## Examples + + Below is an example that checkpoints the sum from 1 to 100, emitting + a trigger with the state every 10 items. The extra 5050 value at the + end is the trigger emitted because processing is done. + + iex> window = Flow.Window.global |> Flow.Window.trigger_every(10) + iex> flow = Flow.from_enumerable(1..100) |> Flow.partition(window: window, stages: 1) + iex> flow |> Flow.reduce(fn -> 0 end, & &1 + &2) |> Flow.emit(:state) |> Enum.to_list() + [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + + Now let's see an example similar to above except we reset the counter + on every trigger. 
At the end, the sum of all values is still 5050: + + iex> window = Flow.Window.global |> Flow.Window.trigger_every(10, :reset) + iex> flow = Flow.from_enumerable(1..100) |> Flow.partition(window: window, stages: 1) + iex> flow |> Flow.reduce(fn -> 0 end, & &1 + &2) |> Flow.emit(:state) |> Enum.to_list() + [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + + """ + @spec trigger_every(t, pos_integer, :keep | :reset) :: t + def trigger_every(window, count, keep_or_reset \\ :keep) + when is_integer(count) and count > 0 and keep_or_reset in @trigger_operation do + name = {:every, count} + + trigger(window, fn -> count end, fn events, acc -> + length = length(events) + if length >= acc do + {pre, pos} = Enum.split(events, acc) + {:trigger, name, pre, keep_or_reset, pos, count} + else + {:cont, acc - length} + end + end) + end + + @doc """ + Emits a trigger periodically every `count` `unit`. + + Such trigger will apply to every window that has changed since the last + periodic trigger. + + `count` is a positive integer and `unit` is one of `:millisecond`, + `:second`, `:minute`, `:hour`. Remember periodic triggers are established + per partition and are message-based, which means partitions will emit the + triggers at different times and possibly with delays based on the partition + message queue size. + + The `keep_or_reset` argument must be one of `:keep` or `:reset`. If + `:keep`, the state accumulated so far on `reduce/3` will be kept, otherwise + discarded. + + The trigger will be named `{:periodically, count, unit}`. + + ## Message-based triggers (timers) + + It is also possible to dispatch a trigger by sending a message to + `self()` with the format of `{:trigger, :keep | :reset, name}`. + This is useful for custom triggers and timers. One example is to + send the message when building the accumulator for `reduce/3`. + If `:reset` is used, every time the accumulator is rebuilt, a new + message will be sent. 
If `:keep` is used and a new timer is necessary, + then `each_state/2` can be called after `reduce/3` to resend it. + + Similar to periodic triggers, message-based triggers will also be + invoked for all windows that have changed since the last trigger. + """ + @spec trigger_periodically(t, pos_integer, System.time_unit, :keep | :reset) :: t + def trigger_periodically(%{periodically: periodically} = window, + count, unit, keep_or_reset \\ :keep) + when is_integer(count) and count > 0 and keep_or_reset in @trigger_operation do + periodically = [{to_ms(count, unit), keep_or_reset, {:periodically, count, unit}} | periodically] + %{window | periodically: periodically} + end + + defp to_ms(count, :millisecond), do: count + defp to_ms(count, :second), do: count * 1000 + defp to_ms(count, :minute), do: count * 1000 * 60 + defp to_ms(count, :hour), do: count * 1000 * 60 * 60 + defp to_ms(_count, unit), do: raise ArgumentError, "unknown unit #{inspect unit} (expected :millisecond, :second, :minute or :hour)" + + defp add_trigger(%{trigger: nil} = window, trigger) do + %{window | trigger: trigger} + end + defp add_trigger(%{}, _trigger) do + raise ArgumentError, "Flow.Window.trigger/3 or Flow.Window.trigger_every/3 " <> + "can only be called once per window" + end +end diff --git a/lib/flow/window/count.ex b/lib/flow/window/count.ex new file mode 100644 index 0000000..bd0973d --- /dev/null +++ b/lib/flow/window/count.ex @@ -0,0 +1,66 @@ +defmodule Flow.Window.Count do + @moduledoc false + + @enforce_keys [:count] + defstruct [:count, :trigger, periodically: []] + + def departition(flow) do + flow + end + + def materialize(%{count: max}, reducer_acc, reducer_fun, reducer_trigger, _options) do + acc = + fn -> {0, max, reducer_acc.()} end + + fun = + fn ref, events, {window, count, acc}, index -> + dispatch(events, window, count, [], acc, + ref, index, max, reducer_acc, reducer_fun, reducer_trigger) + end + + trigger = + fn {window, count, acc}, index, op, name -> + {emit, acc} = reducer_trigger.(acc, index, op, 
{:count, window, name}) + {emit, {window, count, acc}} + end + + {acc, fun, trigger} + end + + defp dispatch([], window, count, emit, acc, _ref, _index, _max, _reducer_acc, _reducer_fun, _reducer_trigger) do + {emit, {window, count, acc}} + end + defp dispatch(events, window, count, emit, acc, ref, index, max, reducer_acc, reducer_fun, reducer_trigger) do + {count, events, rest} = + collect(events, count, []) + {reducer_emit, acc} = + maybe_dispatch(events, acc, ref, index, window, reducer_fun) + {trigger_emit, acc, window, count} = + maybe_trigger(window, count, acc, index, max, reducer_acc, reducer_trigger) + dispatch(rest, window, count, emit ++ reducer_emit ++ trigger_emit, acc, + ref, index, max, reducer_acc, reducer_fun, reducer_trigger) + end + + defp maybe_trigger(window, 0, acc, index, max, reducer_acc, reducer_trigger) do + {trigger_emit, _} = reducer_trigger.(acc, index, :keep, {:count, window, :done}) + {trigger_emit, reducer_acc.(), window + 1, max} + end + defp maybe_trigger(window, count, acc, _index, _max, _reducer_acc, _reducer_trigger) do + {[], acc, window, count} + end + + defp maybe_dispatch([], acc, _ref, _index, _window, _reducer_fun) do + {[], acc} + end + defp maybe_dispatch(events, acc, ref, index, window, reducer_fun) do + if is_function(reducer_fun, 4) do + reducer_fun.(ref, events, acc, index) + else + reducer_fun.(ref, events, acc, index, {:count, window, :placeholder}) + end + end + + defp collect([], count, acc), do: {count, :lists.reverse(acc), []} + defp collect(events, 0, acc), do: {0, :lists.reverse(acc), events} + defp collect([event | events], count, acc), do: collect(events, count - 1, [event | acc]) +end diff --git a/lib/flow/window/fixed.ex b/lib/flow/window/fixed.ex new file mode 100644 index 0000000..3e16f00 --- /dev/null +++ b/lib/flow/window/fixed.ex @@ -0,0 +1,188 @@ +defmodule Flow.Window.Fixed do + @moduledoc false + + @enforce_keys [:by, :duration] + defstruct [:by, :duration, :trigger, lateness: {0, :keep}, 
periodically: []] + + def departition(flow) do + flow + end + + def materialize(%{by: by, duration: duration, lateness: lateness}, + reducer_acc, reducer_fun, reducer_trigger, _options) do + ref = make_ref() + acc = fn -> {nil, %{}} end + lateness_fun = lateness_fun(lateness, duration, ref, reducer_acc, reducer_trigger) + + # The reducing function works in three stages. + # + # 1. We start processing all events, grouping all events that belong + # to the same window and then reducing them. One of the outcomes + # of this function is the most recent window for a given producer. + # + # 2. Next we store the most recent timestamp for the producer and get + # both minimum and maximum seen windows. + # + # 3. Finally we see which windows have been seen by all producers (min) + # and if we are still missing any producer data (max is nil). We catch + # up the `all` window to min, emitting triggers for the old windows. + # + fun = + fn producers, ref, events, {all, windows}, index -> + {reducer_emit, recent, windows} = + split_events(events, ref, [], nil, by, duration, Map.fetch!(producers, ref), + windows, index, reducer_acc, reducer_fun, []) + + # Update the latest window for this producer + producers = Map.put(producers, ref, recent) + min_max = producers |> Map.values |> Enum.min_max() + + {trigger_emit, acc} = + emit_trigger_messages(all, min_max, windows, index, lateness_fun) + + {producers, reducer_emit ++ trigger_emit, acc} + end + + trigger = + fn acc, index, op, name -> + handle_trigger(ref, duration, acc, index, op, name, reducer_acc, reducer_trigger) + end + + {acc, fun, trigger} + end + + ## Reducer + + defp split_events([event | events], ref, buffer, current, by, duration, + recent, windows, index, reducer_acc, reducer_fun, emit) do + window = div(by!(by, event), duration) + if is_nil(current) or window === current do + split_events(events, ref, [event | buffer], window, by, duration, recent, + windows, index, reducer_acc, reducer_fun, emit) + else + {emit, 
recent, windows} = + reduce_events(ref, buffer, current, duration, recent, windows, index, reducer_acc, reducer_fun, emit) + split_events(events, ref, [event], window, by, duration, recent, + windows, index, reducer_acc, reducer_fun, emit) + end + end + defp split_events([], ref, buffer, window, _by, duration, recent, + windows, index, reducer_acc, reducer_fun, emit) do + reduce_events(ref, buffer, window, duration, recent, windows, index, reducer_acc, reducer_fun, emit) + end + + defp reduce_events(_ref, [], _window, _duration, recent, windows, _index, _reducer_acc, _reducer_fun, emit) do + {emit, recent, windows} + end + defp reduce_events(ref, buffer, window, duration, recent, windows, index, reducer_acc, reducer_fun, emit) do + events = :lists.reverse(buffer) + + case recent_window(window, recent, windows, reducer_acc) do + {:ok, window_acc, recent} -> + {new_emit, window_acc} = + if is_function(reducer_fun, 4) do + reducer_fun.(ref, events, window_acc, index) + else + reducer_fun.(ref, events, window_acc, index, {:fixed, window * duration, :placeholder}) + end + {emit ++ new_emit, recent, Map.put(windows, window, window_acc)} + :error -> + {emit, recent, windows} + end + end + + defp recent_window(window, nil, windows, reducer_acc) do + case windows do + %{^window => acc} -> {:ok, acc, window} + %{} -> {:ok, reducer_acc.(), window} + end + end + defp recent_window(window, recent, windows, reducer_acc) do + case windows do + %{^window => acc} -> {:ok, acc, max(window, recent)} + %{} when window >= recent -> {:ok, reducer_acc.(), window} + %{} -> :error + end + end + + defp by!(by, event) do + case by.(event) do + x when is_integer(x) -> x + x -> raise "Flow.Window.fixed/3 expects `by` function to return an integer, " <> + "got #{inspect x} from #{inspect by}" + end + end + + ## Trigger emission + + # We still haven't received from all producers. 
+ defp emit_trigger_messages(old, {_, nil}, windows, _index, _lateness) do + {[], {old, windows}} + end + # We received data from all producers for the first time. + defp emit_trigger_messages(nil, {min, _}, windows, index, lateness) do + emit_trigger_messages(Enum.min(Map.keys(windows)), min, windows, index, lateness, []) + end + # Catch up the old (all) to the new minimum. + defp emit_trigger_messages(old, {min, _}, windows, index, lateness) do + emit_trigger_messages(old, min, windows, index, lateness, []) + end + + defp emit_trigger_messages(new, new, windows, _index, _lateness, emit) do + {emit, {new, windows}} + end + defp emit_trigger_messages(old, new, windows, index, lateness, emit) do + {new_emit, windows} = lateness.(old, windows, index) + emit_trigger_messages(old + 1, new, windows, index, lateness, emit ++ new_emit) + end + + defp lateness_fun({lateness, op}, duration, ref, reducer_acc, reducer_trigger) do + fn window, windows, index -> + acc = Map.get_lazy(windows, window, reducer_acc) + + case lateness do + 0 -> + {emit, _} = reducer_trigger.(acc, index, :keep, {:fixed, window * duration, :done}) + {emit, Map.delete(windows, window)} + _ -> + Process.send_after(self(), {:trigger, :keep, {ref, window}}, lateness) + {emit, window_acc} = reducer_trigger.(acc, index, op, {:fixed, window * duration, :watermark}) + {emit, Map.put(windows, window, window_acc)} + end + end + end + + ## Trigger handling + + # Lateness termination. + def handle_trigger(ref, duration, {current, windows}, index, op, {ref, window}, _acc, trigger) do + case windows do + %{^window => acc} -> + {emit, _window_acc} = trigger.(acc, index, op, {:fixed, window * duration, :done}) + {emit, {current, Map.delete(windows, window)}} + %{} -> + {[], {current, windows}} + end + end + + # Otherwise trigger all windows. 
+ def handle_trigger(_ref, _duration, {current, windows}, _index, _op, _name, _acc, _trigger) + when map_size(windows) == 0 do + {[], {current, windows}} + end + def handle_trigger(_ref, duration, {current, windows}, index, op, name, acc, trigger) do + {min, max} = windows |> Map.keys() |> Enum.min_max() + {emit, windows} = trigger_all(min, max, duration, windows, index, op, name, acc, trigger, []) + {emit, {current, windows}} + end + + defp trigger_all(min, max, _duration, windows, _index, _op, _name, _acc, _trigger, emit) when min > max do + {emit, windows} + end + defp trigger_all(min, max, duration, windows, index, op, name, acc, trigger, emit) do + window_acc = Map.get_lazy(windows, min, acc) + {new_emit, window_acc} = trigger.(window_acc, index, op, {:fixed, min * duration, name}) + windows = Map.put(windows, min, window_acc) + trigger_all(min + 1, max, duration, windows, index, op, name, acc, trigger, emit ++ new_emit) + end +end diff --git a/lib/flow/window/global.ex b/lib/flow/window/global.ex new file mode 100644 index 0000000..9e73bf2 --- /dev/null +++ b/lib/flow/window/global.ex @@ -0,0 +1,30 @@ +defmodule Flow.Window.Global do + @moduledoc false + + @enforce_keys [] + defstruct [:trigger, periodically: []] + + def departition(flow) do + flow + end + + def materialize(_window, reducer_acc, reducer_fun, reducer_trigger, _options) do + acc = reducer_acc + + fun = + if is_function(reducer_fun, 4) do + reducer_fun + else + fn ref, events, acc, index -> + reducer_fun.(ref, events, acc, index, {:global, :global, :placeholder}) + end + end + + trigger = + fn acc, index, op, name -> + reducer_trigger.(acc, index, op, {:global, :global, name}) + end + + {acc, fun, trigger} + end +end diff --git a/lib/flow/window/periodic.ex b/lib/flow/window/periodic.ex new file mode 100644 index 0000000..bf2c818 --- /dev/null +++ b/lib/flow/window/periodic.ex @@ -0,0 +1,49 @@ +defmodule Flow.Window.Periodic do + @moduledoc false + + @enforce_keys [:duration] + defstruct 
[:duration, :trigger, periodically: []] + + def departition(flow) do + flow + end + + def materialize(%{duration: duration}, reducer_acc, reducer_fun, reducer_trigger, _options) do + ref = make_ref() + acc = + fn -> + send_after(ref, duration) + {0, reducer_acc.()} + end + + fun = + if is_function(reducer_fun, 4) do + fn ref, events, {window, acc}, index -> + {emit, acc} = reducer_fun.(ref, events, acc, index) + {emit, {window, acc}} + end + else + fn ref, events, {window, acc}, index -> + {emit, acc} = reducer_fun.(ref, events, acc, index, {:periodic, window, :placeholder}) + {emit, {window, acc}} + end + end + + trigger = + fn + {window, acc}, index, op, ^ref -> + {emit, _} = reducer_trigger.(acc, index, op, {:periodic, window, :done}) + send_after(ref, duration) + {emit, {window + 1, reducer_acc.()}} + {window, acc}, index, op, name -> + {emit, acc} = reducer_trigger.(acc, index, op, {:periodic, window, name}) + {emit, {window, acc}} + end + + {acc, fun, trigger} + end + + defp send_after(ref, duration) do + Process.send_after(self(), {:trigger, :keep, ref}, duration) + end +end diff --git a/lib/flow/window/session.ex b/lib/flow/window/session.ex new file mode 100644 index 0000000..c201242 --- /dev/null +++ b/lib/flow/window/session.ex @@ -0,0 +1,99 @@ +defmodule Flow.Window.Session do + @moduledoc false + + @enforce_keys [:by] + defstruct [:by, :gap, :trigger, periodically: []] + + def departition(_flow) do + raise ArgumentError, "cannot departition on a session window because each session window has its own data" + end + + def materialize(%{by: by, gap: gap}, reducer_acc, reducer_fun, reducer_trigger, options) do + key = key_to_fun(options[:key]) + acc = fn -> %{} end + + fun = + fn ref, events, windows, index -> + events = annotate(events, key, by) + dispatch(events, [], windows, gap, ref, index, reducer_acc, reducer_fun, reducer_trigger) + end + + trigger = + fn windows, index, op, name -> + trigger(Map.to_list(windows), [], %{}, index, op, name, 
reducer_trigger) + end + + {acc, fun, trigger} + end + + defp key_to_fun(nil) do + raise ArgumentError, "Flow.Window.session/3 requires the :key option to be set when partitioning" + end + defp key_to_fun({:elem, pos}) when pos >= 0 do + pos = pos + 1 + &:erlang.element(pos, &1) + end + defp key_to_fun({:key, key}) do + &Map.fetch!(&1, key) + end + defp key_to_fun(fun) when is_function(fun, 1) do + fun + end + + defp annotate(events, key, by) do + for event <- events do + {key.(event), by.(event), event} + end + end + + defp dispatch([{key, by, event} | rest], emit, windows, + gap, ref, index, reducer_acc, reducer_fun, reducer_trigger) do + {trigger_emit, first, acc} = get_window(windows, key, by, gap, index, reducer_acc, reducer_trigger) + {events, rest, last} = look_ahead(rest, key, by, gap, [event], []) + {reducer_emit, acc} = + if is_function(reducer_fun, 4) do + reducer_fun.(ref, events, acc, index) + else + reducer_fun.(ref, events, acc, index, {:session, {key, first, last}, :placeholder}) + end + dispatch(rest, emit ++ trigger_emit ++ reducer_emit, Map.put(windows, key, {first, last, acc}), + gap, ref, index, reducer_acc, reducer_fun, reducer_trigger) + end + defp dispatch([], emit, windows, _gap, _ref, _index, _reducer_acc, _reducer_fun, _reducer_trigger) do + {emit, windows} + end + + defp get_window(windows, key, by, gap, index, reducer_acc, reducer_trigger) do + case windows do + %{^key => {first, last, acc}} when by - last > gap -> + {emit, _} = reducer_trigger.(acc, index, :keep, {:session, {key, first, last}, :done}) + {emit, by, reducer_acc.()} + %{^key => {first, _last, acc}} -> + {[], first, acc} + %{} -> + {[], by, reducer_acc.()} + end + end + + defp look_ahead([{key, by, _} | _] = tuples, key, last, gap, events, rest) when by - last > gap do + {:lists.reverse(events), :lists.reverse(rest, tuples), last} + end + defp look_ahead([{key, by, event} | tuples], key, _last, gap, events, rest) do + look_ahead(tuples, key, by, gap, [event | events], 
rest) + end + defp look_ahead([tuple | tuples], key, last, gap, events, rest) do + look_ahead(tuples, key, last, gap, events, [tuple | rest]) + end + defp look_ahead([], _key, last, _gap, events, rest) do + {:lists.reverse(events), :lists.reverse(rest), last} + end + + defp trigger([{key, {first, last, acc}} | rest], emit, windows, index, op, name, reducer_trigger) do + {trigger_emit, acc} = reducer_trigger.(acc, index, op, {:session, {key, first, last}, name}) + trigger(rest, emit ++ trigger_emit, Map.put(windows, key, {first, last, acc}), + index, op, name, reducer_trigger) + end + defp trigger([], emit, windows, _index, _op, _name, _reducer_trigger) do + {emit, windows} + end +end diff --git a/mix.exs b/mix.exs new file mode 100644 index 0000000..28ab0dd --- /dev/null +++ b/mix.exs @@ -0,0 +1,34 @@ +defmodule Flow.Mixfile do + use Mix.Project + + @version "0.11.0" + + def project do + [app: :flow, + version: @version, + elixir: "~> 1.3", + package: package(), + description: "Computational parallel flows for Elixir", + build_embedded: Mix.env == :prod, + start_permanent: Mix.env == :prod, + deps: deps(), + docs: [main: "Flow", source_ref: "v#{@version}", + source_url: "https://github.com/elixir-lang/flow"]] + end + + def application do + [applications: [:logger]] + end + + defp deps do + [{:gen_stage, github: "elixir-lang/gen_stage"}, + {:ex_doc, "~> 0.12", only: :docs}, + {:inch_ex, ">= 0.4.0", only: :docs}] + end + + defp package do + %{licenses: ["Apache 2"], + maintainers: ["José Valim", "James Fish"], + links: %{"GitHub" => "https://github.com/elixir-lang/flow"}} + end +end diff --git a/mix.lock b/mix.lock new file mode 100644 index 0000000..c264dcb --- /dev/null +++ b/mix.lock @@ -0,0 +1,5 @@ +%{"earmark": {:hex, :earmark, "1.0.3", "89bdbaf2aca8bbb5c97d8b3b55c5dd0cff517ecc78d417e87f1d0982e514557b", [:mix], []}, + "ex_doc": {:hex, :ex_doc, "0.14.5", "c0433c8117e948404d93ca69411dd575ec6be39b47802e81ca8d91017a0cf83c", [:mix], [{:earmark, "~> 1.0", [hex: 
:earmark, optional: false]}]}, + "gen_stage": {:git, "https://github.com/elixir-lang/gen_stage.git", "2f8cf4d1923c1cd35cd54d7630dd87a659031954", []}, + "inch_ex": {:hex, :inch_ex, "0.5.5", "b63f57e281467bd3456461525fdbc9e158c8edbe603da6e3e4671befde796a3d", [:mix], [{:poison, "~> 1.5 or ~> 2.0 or ~> 3.0", [hex: :poison, optional: false]}]}, + "poison": {:hex, :poison, "3.0.0", "625ebd64d33ae2e65201c2c14d6c85c27cc8b68f2d0dd37828fde9c6920dd131", [:mix], []}} diff --git a/test/flow/window/count_test.exs b/test/flow/window/count_test.exs new file mode 100644 index 0000000..fd0a0d6 --- /dev/null +++ b/test/flow/window/count_test.exs @@ -0,0 +1,160 @@ +defmodule Flow.Window.CountTest do + use ExUnit.Case, async: true + + defp single_window do + Flow.Window.count(1000) + end + + describe "single window" do + test "with multiple mappers and reducers" do + assert Flow.from_enumerable(1..100, stages: 4, max_demand: 5) + |> Flow.map(&(&1)) + |> Flow.partition(window: single_window(), stages: 4) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sum() == 5050 + end + + test "trigger keep with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger keep with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10), stages: 1, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger discard with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> 
Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger discard with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger ordering" do + window = + Flow.Window.trigger(single_window(), fn -> true end, fn events, true -> + {:cont, Enum.all?(events, &rem(&1, 2) == 0)} + end) + + assert Flow.from_enumerable(1..10) + |> Flow.partition(window: window, stages: 1) + |> Flow.map(& &1 + 1) + |> Flow.map(& &1 * 2) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sort() == [130] + end + + test "trigger names" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:count, 0, trigger} -> {trigger, state} end) + |> Flow.emit(:state) + |> Enum.sort() == [{:done, 0}, + {{:every, 10}, 55}, {{:every, 10}, 155}, + {{:every, 10}, 255}, {{:every, 10}, 355}, + {{:every, 10}, 455}, {{:every, 10}, 555}, + {{:every, 10}, 655}, {{:every, 10}, 755}, + {{:every, 10}, 855}, {{:every, 10}, 955}] + end + + test "trigger based on intervals" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 10) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(& &1 * 2) + |> Flow.emit(:state) + |> Enum.take(1) == [110] + end + + test "trigger based on timers" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(stages: 1, max_demand: 10, 
window: single_window()) + |> Flow.reduce(fn -> + Process.send_after(self(), {:trigger, :reset, :sample}, 200) + 0 + end, & &1 + &2) + |> Flow.map_state(&{&1 * 2, &2, &3}) + |> Flow.emit(:state) + |> Enum.take(1) == [{110, {0, 1}, {:count, 0, :sample}}] + end + end + + defp double_ordered_window do + Flow.Window.count(50) + end + + describe "double ordered windows" do + test "reduces per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [1275, 3775, 0] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:count, count, trigger} -> [{state, count, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {1176, 0, {:every, 12}}, + {1275, 0, :done}, + {678, 1, {:every, 12}}, + {1500, 1, {:every, 12}}, + {2466, 1, {:every, 12}}, + {3576, 1, {:every, 12}}, + {3775, 1, :done}, + {0, 2, :done}] + end + + test "reduces per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [1275, 3775, 0] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:count, count, trigger} -> [{state, count, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, 
{:every, 12}}, + {666, 0, {:every, 12}}, + {1176, 0, {:every, 12}}, + {1275, 0, :done}, + {678, 1, {:every, 12}}, + {1500, 1, {:every, 12}}, + {2466, 1, {:every, 12}}, + {3576, 1, {:every, 12}}, + {3775, 1, :done}, + {0, 2, :done}] + end + end +end diff --git a/test/flow/window/fixed_test.exs b/test/flow/window/fixed_test.exs new file mode 100644 index 0000000..50ef597 --- /dev/null +++ b/test/flow/window/fixed_test.exs @@ -0,0 +1,395 @@ +defmodule Flow.Window.FixedTest do + use ExUnit.Case, async: true + + defp single_window do + Flow.Window.fixed(1, :second, fn _ -> 0 end) + end + + describe "single window" do + test "with multiple mappers and reducers" do + assert Flow.from_enumerable(1..100, stages: 4, max_demand: 5) + |> Flow.map(&(&1)) + |> Flow.partition(window: single_window(), stages: 4) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sum() == 5050 + end + + test "trigger keep with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger keep with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10), stages: 1, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger discard with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger discard with small demand" do + assert Flow.from_enumerable(1..100) + |> 
Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger ordering" do + window = + Flow.Window.trigger(single_window(), fn -> true end, fn events, true -> + {:cont, Enum.all?(events, &rem(&1, 2) == 0)} + end) + + assert Flow.from_enumerable(1..10) + |> Flow.partition(window: window, stages: 1) + |> Flow.map(& &1 + 1) + |> Flow.map(& &1 * 2) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sort() == [130] + end + + test "trigger names" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, 0, trigger} -> {trigger, state} end) + |> Flow.emit(:state) + |> Enum.sort() == [{:done, 0}, + {{:every, 10}, 55}, {{:every, 10}, 155}, + {{:every, 10}, 255}, {{:every, 10}, 355}, + {{:every, 10}, 455}, {{:every, 10}, 555}, + {{:every, 10}, 655}, {{:every, 10}, 755}, + {{:every, 10}, 855}, {{:every, 10}, 955}] + end + + test "trigger based on intervals" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 10) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(& &1 * 2) + |> Flow.emit(:state) + |> Enum.take(1) == [110] + end + + test "trigger based on timers" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(stages: 1, max_demand: 10, window: single_window()) + |> Flow.reduce(fn -> + Process.send_after(self(), {:trigger, :reset, :sample}, 200) + 0 + end, & &1 + &2) + |> Flow.map_state(&{&1 * 2, &2, &3}) + |> Flow.emit(:state) 
+ |> Enum.take(1) == [{110, {0, 1}, {:fixed, 0, :sample}}] + end + end + + defp double_ordered_window do + Flow.Window.fixed(1, :second, fn + x when x <= 50 -> 0 + x + x when x <= 100 -> 1_000 + x + end) + end + + describe "double ordered windows" do + test "reduces per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [1275, 3775] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {1176, 0, {:every, 12}}, + {678, 1000, {:every, 12}}, + {1500, 1000, {:every, 12}}, + {2466, 1000, {:every, 12}}, + {3576, 1000, {:every, 12}}, + {1275, 0, :done}, + {3775, 1000, :done}] + end + + test "reduces per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [1275, 3775] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {1176, 0, {:every, 12}}, + {1275, 0, :done}, + {678, 1000, {:every, 12}}, + {1500, 1000, 
{:every, 12}}, + {2466, 1000, {:every, 12}}, + {3576, 1000, {:every, 12}}, + {3775, 1000, :done}] + end + + test "triggers for all windows" do + assert Flow.from_enumerable(Stream.concat(1..100, Stream.timer(:infinity)), max_demand: 5, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.take(2) == [{1275, 0, :done}, + {3775, 1000, {:periodically, 100, :millisecond}}] + end + end + + defp double_unordered_window_without_lateness do + Flow.Window.fixed(1, :second, fn + x when x <= 40 -> 0 + x when x <= 80 -> 2_000 + x when x <= 100 -> 0 # Those events will be lost + end) + end + + # With one stage, termination happens when one stage is done. + describe "double unordered windows without lateness with one stage" do + test "reduces per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_without_lateness(), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [2630, 0, 2420] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_without_lateness() |> Flow.Window.trigger_every(12), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {558, 2000, {:every, 12}}, + {1260, 2000, {:every, 12}}, + {2106, 2000, {:every, 12}}, + {1496, 0, {:every, 12}}, + {2630, 0, {:every, 12}}, + {2630, 0, :done}, + {0, 1000, :done}, + {2420, 2000, :done}] + end + + test "reduces per window with small demand" do + assert 
Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_without_lateness(), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [820, 0, 2420] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_without_lateness() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {820, 0, :done}, + {0, 1000, :done}, + {558, 2000, {:every, 12}}, + {1260, 2000, {:every, 12}}, + {2106, 2000, {:every, 12}}, + {2420, 2000, :done}] + end + + test "triggers for all windows" do + assert Flow.from_enumerable(Stream.concat(1..100, Stream.timer(:infinity)), max_demand: 5, stages: 1) + |> Flow.partition(window: double_unordered_window_without_lateness() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.take(3) == [{820, 0, :done}, + {0, 1000, :done}, + {2420, 2000, {:periodically, 100, :millisecond}}] + end + end + + # With two stages, termination is only guaranteed once both stages are done. 
+ describe "double unordered windows without lateness with two stages" do + test "reduces per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 2) + |> Flow.partition(window: double_unordered_window_without_lateness(), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [2630, 0, 2420] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 2) + |> Flow.partition(window: double_unordered_window_without_lateness() |> Flow.Window.trigger_every(12), + stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {558, 2000, {:every, 12}}, + {1260, 2000, {:every, 12}}, + {2106, 2000, {:every, 12}}, + {1496, 0, {:every, 12}}, + {2630, 0, {:every, 12}}, + {2630, 0, :done}, + {0, 1000, :done}, + {2420, 2000, :done}] + end + + test "reduces per window with small demand" do + # We were not supposed to receive all data but, + # because we have two stages, we are only done + # once both stages are done, so we end up consuming + # late events while the other producer is open.
+ assert Flow.from_enumerable(1..100, stages: 2) + |> Flow.map(& &1) + |> Flow.partition(window: double_unordered_window_without_lateness(), + stages: 1, max_demand: 100) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [2630, 0, 2420] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 2) + |> Flow.map(& &1) + |> Flow.partition(window: double_unordered_window_without_lateness() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {558, 2000, {:every, 12}}, + {1260, 2000, {:every, 12}}, + {2106, 2000, {:every, 12}}, + {1496, 0, {:every, 12}}, + {2630, 0, {:every, 12}}, + {2630, 0, :done}, + {0, 1000, :done}, + {2420, 2000, :done}] + end + end + + defp double_unordered_window_with_lateness(keep_or_reset \\ :keep) do + Flow.Window.fixed(1, :second, fn + x when x <= 40 -> 0 + x when x <= 80 -> 2_000 + x when x <= 100 -> 0 # Those events won't be lost due to lateness + end) |> Flow.Window.allowed_lateness(1, :hour, keep_or_reset) + end + + # With one stage, termination happens when one stage is done.
+ describe "double unordered windows with lateness with one stage" do + test "reduces per window with large demand and keep buffer" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_with_lateness(:keep), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [2630, 0, 2630, 0, 2420] + end + + test "reduces per window with large demand and reset buffer" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_with_lateness(:reset), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [2630, 0, 0, 0, 2420] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_with_lateness() |> Flow.Window.trigger_every(12), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {558, 2000, {:every, 12}}, + {1260, 2000, {:every, 12}}, + {2106, 2000, {:every, 12}}, + {1496, 0, {:every, 12}}, + {2630, 0, {:every, 12}}, + {2630, 0, :watermark}, + {0, 1000, :watermark}, + {2630, 0, :done}, + {0, 1000, :done}, + {2420, 2000, :done}] + end + + test "reduces per window with small demand and keep buffer" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_with_lateness(), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [820, 0, 2630, 0, 2420] + end + + test "reduces per window with small demand and reset buffer" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_with_lateness(:reset), + stages: 1, max_demand: 5, min_demand: 0) + |> 
Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [820, 0, 1810, 0, 2420] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_unordered_window_with_lateness() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:fixed, fixed, trigger} -> [{state, fixed, trigger}] end) + |> Enum.to_list() == [{78, 0, {:every, 12}}, + {300, 0, {:every, 12}}, + {666, 0, {:every, 12}}, + {820, 0, :watermark}, + {0, 1000, :watermark}, + {558, 2000, {:every, 12}}, + {1260, 2000, {:every, 12}}, + {2106, 2000, {:every, 12}}, + {1496, 0, {:every, 12}}, + {2630, 0, {:every, 12}}, + {2630, 0, :done}, + {0, 1000, :done}, + {2420, 2000, :done}] + end + end +end diff --git a/test/flow/window/global_test.exs b/test/flow/window/global_test.exs new file mode 100644 index 0000000..88ea524 --- /dev/null +++ b/test/flow/window/global_test.exs @@ -0,0 +1,87 @@ +defmodule Flow.Window.GlobalTest do + use ExUnit.Case, async: true + + test "trigger keep with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_every(10), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger keep with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_every(10), stages: 1, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger discard with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_every(10, :reset), stages: 1) + |> 
Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger discard with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_every(10, :reset), + stages: 1, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger ordering" do + window = + Flow.Window.trigger(Flow.Window.global, fn -> true end, fn events, true -> + {:cont, Enum.all?(events, &rem(&1, 2) == 0)} + end) + + assert Flow.from_enumerable(1..10) + |> Flow.partition(window: window, stages: 1) + |> Flow.map(& &1 + 1) + |> Flow.map(& &1 * 2) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sort() == [130] + end + + test "trigger names" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_every(10, :reset), stages: 1) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:global, :global, trigger} -> {trigger, state} end) + |> Flow.emit(:state) + |> Enum.sort() == [{:done, 0}, + {{:every, 10}, 55}, {{:every, 10}, 155}, + {{:every, 10}, 255}, {{:every, 10}, 355}, + {{:every, 10}, 455}, {{:every, 10}, 555}, + {{:every, 10}, 655}, {{:every, 10}, 755}, + {{:every, 10}, 855}, {{:every, 10}, 955}] + end + + test "trigger based on intervals" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5) + |> Flow.partition(window: Flow.Window.global |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 10) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(& &1 * 2) + |> Flow.emit(:state) + |> Enum.take(1) == [110] + end + + test "trigger based on timers" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) 
+ |> Flow.partition(stages: 1, max_demand: 10) + |> Flow.reduce(fn -> + Process.send_after(self(), {:trigger, :reset, :sample}, 200) + 0 + end, & &1 + &2) + |> Flow.map_state(&{&1 * 2, &2, &3}) + |> Flow.emit(:state) + |> Enum.take(1) == [{110, {0, 1}, {:global, :global, :sample}}] + end +end diff --git a/test/flow/window/periodic_test.exs b/test/flow/window/periodic_test.exs new file mode 100644 index 0000000..4fbdc19 --- /dev/null +++ b/test/flow/window/periodic_test.exs @@ -0,0 +1,28 @@ +defmodule Flow.Window.PeriodicTest do + use ExUnit.Case, async: true + + defp single_window do + Flow.Window.periodic(100, :millisecond) + end + + test "emits based on intervals" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5) + |> Flow.partition(window: single_window(), stages: 1, max_demand: 10) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, index, {:periodic, window, :done} -> {state, index, window} end) + |> Flow.emit(:state) + |> Enum.take(2) == [{55, {0, 1}, 0}, {0, {0, 1}, 1}] + end + + test "emits based on intervals with count triggers" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(5), + stages: 1, max_demand: 10) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:periodic, window, trigger} -> {state, window, trigger} end) + |> Flow.emit(:state) + |> Enum.take(3) == [{15, 0, {:every, 5}}, + {55, 0, {:every, 5}}, + {55, 0, :done}] + end +end diff --git a/test/flow/window/session_test.exs b/test/flow/window/session_test.exs new file mode 100644 index 0000000..c7294f5 --- /dev/null +++ b/test/flow/window/session_test.exs @@ -0,0 +1,251 @@ +defmodule Flow.Window.SessionTest do + use ExUnit.Case, async: true + + defp single_window do + Flow.Window.session(1, :second, fn x -> x end) + end + + @tag :capture_log + test "can't be departitioned" do + 
assert catch_exit( + Flow.from_enumerable(1..100, stages: 4, max_demand: 5) + |> Flow.partition(window: single_window(), stages: 4, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.departition(fn -> 0 end, & &1 + &2, &(&1)) + |> Enum.to_list + ) + end + + describe "single window" do + test "with multiple mappers and reducers" do + assert Flow.from_enumerable(1..100, stages: 4, max_demand: 5) + |> Flow.map(&(&1)) + |> Flow.partition(window: single_window(), stages: 4, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sum() == 5050 + end + + test "trigger keep with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10), stages: 1, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger keep with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10), stages: 1, max_demand: 5, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 210, 465, 820, 1275, 1830, 2485, 3240, 4095, 5050, 5050] + end + + test "trigger discard with large demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 555, 655, 755, 855, 955, 0] + end + + test "trigger discard with small demand" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1, max_demand: 5, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [55, 155, 255, 355, 455, 
555, 655, 755, 855, 955, 0] + end + + test "trigger ordering" do + window = + Flow.Window.trigger(single_window(), fn -> true end, fn events, true -> + {:cont, Enum.all?(events, &rem(&1, 2) == 0)} + end) + + assert Flow.from_enumerable(1..10) + |> Flow.partition(window: window, stages: 1, key: fn _ -> 0 end) + |> Flow.map(& &1 + 1) + |> Flow.map(& &1 * 2) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.sort() == [130] + end + + test "trigger names" do + assert Flow.from_enumerable(1..100) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_every(10, :reset), stages: 1, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, {0, 1, _}, trigger} -> {trigger, state} end) + |> Flow.emit(:state) + |> Enum.sort() == [{:done, 0}, + {{:every, 10}, 55}, {{:every, 10}, 155}, + {{:every, 10}, 255}, {{:every, 10}, 355}, + {{:every, 10}, 455}, {{:every, 10}, 555}, + {{:every, 10}, 655}, {{:every, 10}, 755}, + {{:every, 10}, 855}, {{:every, 10}, 955}] + end + + test "trigger based on intervals" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(window: single_window() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 10, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(& &1 * 2) + |> Flow.emit(:state) + |> Enum.take(1) == [110] + end + + test "trigger based on timers" do + assert Flow.from_enumerable(Stream.concat(1..10, Stream.timer(:infinity)), max_demand: 5, stages: 2) + |> Flow.partition(stages: 1, max_demand: 10, window: single_window(), key: fn _ -> 0 end) + |> Flow.reduce(fn -> + Process.send_after(self(), {:trigger, :reset, :sample}, 200) + 0 + end, & &1 + &2) + |> Flow.map_state(&{&1 * 2, &2, &3}) + |> Flow.emit(:state) + |> Enum.take(1) == [{110, {0, 1}, {:session, {0, 1, 10}, :sample}}] + end + end + + defp double_ordered_window do + 
Flow.Window.session(1, :second, fn + x when x <= 50 -> 0 + x + x when x <= 100 -> 1_000 + x + end) + end + + describe "double ordered windows with single key" do + test "reduces per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [1275, 3775] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), stages: 1, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, session, trigger} -> [{state, session, trigger}] end) + |> Enum.to_list() == [{78, {0, 1, 50}, {:every, 12}}, + {300, {0, 1, 50}, {:every, 12}}, + {666, {0, 1, 50}, {:every, 12}}, + {1176, {0, 1, 50}, {:every, 12}}, + {1275, {0, 1, 50}, :done}, + {678, {0, 1051, 1100}, {:every, 12}}, + {1500, {0, 1051, 1100}, {:every, 12}}, + {2466, {0, 1051, 1100}, {:every, 12}}, + {3576, {0, 1051, 1100}, {:every, 12}}, + {3775, {0, 1051, 1100}, :done}] + end + + test "reduces per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1, max_demand: 5, min_demand: 0, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [1275, 3775] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, session, trigger} -> [{state, session, trigger}] end) + |> Enum.to_list() == [{78, {0, 1, 15}, {:every, 12}}, + {300, {0, 1, 25}, {:every, 
12}}, + {666, {0, 1, 40}, {:every, 12}}, + {1176, {0, 1, 50}, {:every, 12}}, + {1275, {0, 1, 50}, :done}, + {678, {0, 1051, 1065}, {:every, 12}}, + {1500, {0, 1051, 1075}, {:every, 12}}, + {2466, {0, 1051, 1090}, {:every, 12}}, + {3576, {0, 1051, 1100}, {:every, 12}}, + {3775, {0, 1051, 1100}, :done}] + end + + test "triggers for all windows" do + assert Flow.from_enumerable(Stream.concat(1..100, Stream.timer(:infinity)), max_demand: 5, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 5, min_demand: 0, key: fn _ -> 0 end) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, session, trigger} -> [{state, session, trigger}] end) + |> Enum.take(2) == [{1275, {0, 1, 50}, :done}, + {3775, {0, 1051, 1100}, {:periodically, 100, :millisecond}}] + end + end + + describe "double ordered windows with multiple keys" do + test "reduces per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1, key: &rem(&1, 2)) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [625, 650, 1900, 1875] + end + + test "triggers per window with large demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), stages: 1, key: &rem(&1, 2)) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, session, trigger} -> [{state, session, trigger}] end) + |> Enum.sort() == [{144, {1, 1, 49}, {:every, 12}}, + {156, {0, 2, 50}, {:every, 12}}, + {576, {1, 1, 49}, {:every, 12}}, + {600, {0, 2, 50}, {:every, 12}}, + {625, {1, 1, 49}, :done}, + {650, {0, 2, 50}, :done}, + {744, {1, 1051, 1099}, {:every, 12}}, + {756, {0, 1052, 1100}, {:every, 12}}, + {1776, {1, 1051, 1099}, {:every, 12}}, + {1800, {0, 1052, 1100}, {:every, 12}}, + {1875, {1, 1051, 1099}, 
:done}, + {1900, {0, 1052, 1100}, :done}] + end + + test "reduces per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window(), stages: 1, max_demand: 5, min_demand: 0, key: &rem(&1, 2)) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() == [625, 650, 1900, 1875] + end + + test "triggers per window with small demand" do + assert Flow.from_enumerable(1..100, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_every(12), + stages: 1, max_demand: 5, min_demand: 0, key: &rem(&1, 2)) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, session, trigger} -> [{state, session, trigger}] end) + |> Enum.sort() == [{144, {1, 1, 25}, {:every, 12}}, + {156, {0, 2, 24}, {:every, 12}}, + {576, {1, 1, 49}, {:every, 12}}, + {600, {0, 2, 50}, {:every, 12}}, + {625, {1, 1, 49}, :done}, + {650, {0, 2, 50}, :done}, + {744, {1, 1051, 1075}, {:every, 12}}, + {756, {0, 1052, 1074}, {:every, 12}}, + {1776, {1, 1051, 1099}, {:every, 12}}, + {1800, {0, 1052, 1100}, {:every, 12}}, + {1875, {1, 1051, 1099}, :done}, + {1900, {0, 1052, 1100}, :done}] + end + + test "triggers for all windows" do + assert Flow.from_enumerable(Stream.concat(1..100, Stream.timer(:infinity)), max_demand: 5, stages: 1) + |> Flow.partition(window: double_ordered_window() |> Flow.Window.trigger_periodically(100, :millisecond), + stages: 1, max_demand: 5, min_demand: 0, key: &rem(&1, 2)) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.map_state(fn state, _, {:session, session, trigger} -> [{state, session, trigger}] end) + |> Enum.take(4) == [{625, {1, 1, 49}, :done}, + {650, {0, 2, 50}, :done}, + {1900, {0, 1052, 1100}, {:periodically, 100, :millisecond}}, + {1875, {1, 1051, 1099}, {:periodically, 100, :millisecond}}] + end + end +end diff --git a/test/flow/window_test.exs b/test/flow/window_test.exs new file mode 100644 index 0000000..5d4fdbb --- 
/dev/null +++ b/test/flow/window_test.exs @@ -0,0 +1,21 @@ +defmodule Flow.Window.Test do + use ExUnit.Case, async: true + doctest Flow.Window + + test "periodic triggers" do + assert Flow.Window.global + |> Flow.Window.trigger_periodically(10, :second, :keep) + |> Map.fetch!(:periodically) == + [{10000, :keep, {:periodically, 10, :second}}] + + assert Flow.Window.global + |> Flow.Window.trigger_periodically(10, :minute, :keep) + |> Map.fetch!(:periodically) == + [{600000, :keep, {:periodically, 10, :minute}}] + + assert Flow.Window.global + |> Flow.Window.trigger_periodically(10, :hour, :keep) + |> Map.fetch!(:periodically) == + [{36000000, :keep, {:periodically, 10, :hour}}] + end +end diff --git a/test/flow_test.exs b/test/flow_test.exs new file mode 100644 index 0000000..4066412 --- /dev/null +++ b/test/flow_test.exs @@ -0,0 +1,797 @@ +defmodule FlowTest do + use ExUnit.Case, async: true + + doctest Flow + + defmodule Counter do + use GenStage + + def init(counter) do + {:producer, counter} + end + + def handle_demand(demand, counter) when demand > 0 do + # If the counter is 3 and we ask for 2 items, we will + # emit the items 3 and 4, and set the state to 5. 
+ events = Enum.to_list(counter..counter+demand-1) + {:noreply, events, counter + demand} + end + end + + defmodule Forwarder do + use GenStage + + def init(parent) do + {:consumer, parent} + end + + def handle_events(events, _from, parent) do + send parent, {:consumed, events} + {:noreply, [], parent} + end + + def handle_info({{pid, ref}, {:producer, _}}, parent) do + GenStage.cancel({pid, ref}, :normal) + {:noreply, [], parent} + end + end + + describe "errors" do + test "on multiple reduce calls" do + assert_raise ArgumentError, ~r"cannot call group_by/reduce on a flow after another group_by/reduce operation", fn -> + Flow.from_enumerable([1, 2, 3]) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Enum.to_list + end + end + + test "on map_state without reduce" do + assert_raise ArgumentError, ~r"map_state/2 must be called after a group_by/reduce operation", fn -> + Flow.from_enumerable([1, 2, 3]) + |> Flow.map_state(fn x -> x end) + |> Enum.to_list + end + end + + @tag :capture_log + test "on window without computation" do + assert catch_exit( + Flow.from_enumerable([1, 2, 3], window: Flow.Window.fixed(1, :second, & &1)) |> Enum.to_list + ) + end + end + + describe "run/1" do + test "does not leave lingering messages nor monitors" do + Flow.from_enumerable(1..100, stages: 4) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() + + refute_received _ + assert Process.info(self(), :monitors) == {:monitors, []} + end + end + + describe "enumerable-stream" do + @flow Flow.from_enumerables([[1, 2, 3], [4, 5, 6]], stages: 2) + + test "only sources" do + assert @flow |> Enum.sort() == [1, 2, 3, 4, 5, 6] + end + + @tag :capture_log + test "raises locally" do + assert catch_exit(@flow |> Flow.map(fn _ -> raise "oops" end) |> Enum.to_list) + end + + test "each/2" do + parent = self() + assert @flow |> Flow.each(&send(parent, &1)) |> Enum.sort() == + [1, 2, 3, 4, 5, 6] + assert_received 1 + 
assert_received 2 + assert_received 3 + end + + test "filter/2" do + assert @flow |> Flow.filter(&rem(&1, 2) == 0) |> Enum.sort() == + [2, 4, 6] + end + + test "filter_map/3" do + assert @flow |> Flow.filter_map(&rem(&1, 2) == 0, & &1 * 2) |> Enum.sort() == + [4, 8, 12] + end + + test "flat_map/2" do + assert @flow |> Flow.flat_map(&[&1, &1]) |> Enum.sort() == + [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6] + end + + test "map/2" do + assert @flow |> Flow.map(& &1 * 2) |> Enum.sort() == + [2, 4, 6, 8, 10, 12] + end + + test "reject/2" do + assert @flow |> Flow.reject(&rem(&1, 2) == 0) |> Enum.sort() == + [1, 3, 5] + end + + test "uniq_by/2" do + result = @flow |> Flow.uniq_by(&rem(&1, 2)) |> Enum.sort() + assert length(result) == 2 + end + + test "keeps ordering" do + flow = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.map(fn(x) -> x + 1 end) + |> Flow.map(fn(x) -> x * 2 end) + assert Enum.sort(flow) == [6, 10, 14] + end + + test "start_link/2" do + parent = self() + + {:ok, pid} = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.each(&send(parent, &1)) + |> Flow.start_link() + + assert_receive 2 + assert_receive 4 + assert_receive 6 + refute_received 1 + + ref = Process.monitor(pid) + assert_receive {:DOWN, ^ref, _, _, _} + end + + test "into_stages/3" do + {:ok, forwarder} = GenStage.start_link(Forwarder, self()) + + {:ok, pid} = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.into_stages([forwarder]) + + assert_receive {:consumed, [2]} + assert_receive {:consumed, [4, 6]} + + ref = Process.monitor(pid) + assert_receive {:DOWN, ^ref, _, _, _} + end + end + + describe "enumerable-unpartioned-stream" do + @flow Flow.from_enumerables([[1, 2, 3], [4, 5, 6]], stages: 4) + + test "only sources" do + assert @flow |> Enum.sort() == [1, 2, 3, 4, 5, 6] + end + + @tag :capture_log + test "raises locally" do + assert catch_exit(@flow |> Flow.map(fn _ -> raise "oops" end) |> Enum.to_list) + end + + test "each/2" do + parent = self() + assert @flow |> 
Flow.each(&send(parent, &1)) |> Enum.sort() == + [1, 2, 3, 4, 5, 6] + assert_received 1 + assert_received 2 + assert_received 3 + end + + test "filter/2" do + assert @flow |> Flow.filter(&rem(&1, 2) == 0) |> Enum.sort() == + [2, 4, 6] + end + + test "filter_map/3" do + assert @flow |> Flow.filter_map(&rem(&1, 2) == 0, & &1 * 2) |> Enum.sort() == + [4, 8, 12] + end + + test "flat_map/2" do + assert @flow |> Flow.flat_map(&[&1, &1]) |> Enum.sort() == + [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6] + end + + test "map/2" do + assert @flow |> Flow.map(& &1 * 2) |> Enum.sort() == + [2, 4, 6, 8, 10, 12] + end + + test "reject/2" do + assert @flow |> Flow.reject(&rem(&1, 2) == 0) |> Enum.sort() == + [1, 3, 5] + end + + test "reduce/3" do + assert @flow |> Flow.reduce(fn -> 0 end, &+/2) |> Flow.map_state(&[&1]) |> Enum.sum() == + 21 + end + + test "uniq_by/2" do + result = @flow |> Flow.uniq_by(&rem(&1, 2)) |> Enum.sort() + assert length(result) == 2 + end + + test "keeps ordering" do + flow = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.map(fn(x) -> x + 1 end) + |> Flow.map(fn(x) -> x * 2 end) + assert Enum.sort(flow) == [6, 10, 14] + end + + test "allows custom windowding" do + window = + Flow.Window.fixed(1, :second, fn + x when x <= 50 -> 0 + x when x <= 100 -> 1_000 + end) + + windows = Flow.from_enumerable(1..100, window: window, stages: 4, max_demand: 5) + |> Flow.reduce(fn -> 0 end, & &1 + &2) + |> Flow.emit(:state) + |> Enum.to_list() + assert length(windows) == 8 + assert Enum.sum(windows) == 5050 + end + + test "start_link/2" do + parent = self() + + {:ok, pid} = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.each(&send(parent, &1)) + |> Flow.start_link() + + assert_receive 2 + assert_receive 4 + assert_receive 6 + refute_received 1 + + ref = Process.monitor(pid) + assert_receive {:DOWN, ^ref, _, _, _} + end + + test "into_stages/3" do + {:ok, forwarder} = GenStage.start_link(Forwarder, self()) + + {:ok, _} = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> 
Flow.into_stages([forwarder]) + + assert_receive {:consumed, [2]} + assert_receive {:consumed, [4, 6]} + end + end + + describe "enumerable-partitioned-stream" do + @flow Flow.from_enumerables([[1, 2, 3], [4, 5, 6], 7..10], stages: 4) + |> Flow.partition(stages: 4) + + test "only sources" do + assert Flow.from_enumerables([[1, 2, 3], [4, 5, 6], 7..10], stages: 4) + |> Flow.partition(stages: 4) + |> Enum.sort() == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + assert Flow.from_enumerables([[1, 2, 3], [4, 5, 6], 7..10], stages: 4) + |> Flow.partition(stages: 4) + |> Flow.reduce(fn -> [] end, &[&1 | &2]) + |> Flow.emit(:state) + |> Enum.map(&Enum.sort/1) + |> Enum.sort() == [[1, 5, 7, 9], [2, 6, 8], [3, 4], [10]] + end + + @tag :capture_log + test "raises locally" do + assert catch_exit(@flow |> Flow.map(fn _ -> raise "oops" end) |> Enum.to_list) + end + + test "each/2" do + parent = self() + assert @flow |> Flow.each(&send(parent, &1)) |> Enum.sort() == + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + assert_received 1 + assert_received 2 + assert_received 3 + end + + test "filter/2" do + assert @flow |> Flow.filter(&rem(&1, 2) == 0) |> Enum.sort() == + [2, 4, 6, 8, 10] + end + + test "filter_map/3" do + assert @flow |> Flow.filter_map(&rem(&1, 2) == 0, & &1 * 2) |> Enum.sort() == + [4, 8, 12, 16, 20] + end + + test "flat_map/2" do + assert @flow |> Flow.flat_map(&[&1, &1]) |> Enum.sort() == + [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10] + end + + test "map/2" do + assert @flow |> Flow.map(& &1 * 2) |> Enum.sort() == + [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] + end + + test "reject/2" do + assert @flow |> Flow.reject(&rem(&1, 2) == 0) |> Enum.sort() == + [1, 3, 5, 7, 9] + end + + test "reduce/3" do + assert @flow |> Flow.reduce(fn -> 0 end, &+/2) |> Flow.map_state(&[&1]) |> Enum.sort() == + [7, 10, 16, 22] + + assert @flow |> Flow.reject(&rem(&1, 2) == 0) |> Flow.reduce(fn -> 0 end, &+/2) |> Flow.map_state(&[&1]) |> Enum.sort() == + [0, 0, 3, 22] + end + + test "uniq_by/2" do 
+ result = @flow |> Flow.uniq_by(&rem(&1, 2)) |> Enum.sort() + assert length(result) == 5 + end + + test "uniq_by/2 after reduce/3" do + assert @flow + |> Flow.reduce(fn -> [] end, &[&1 | &2]) + |> Flow.map_state(&Enum.sort/1) + |> Flow.uniq_by(&rem(&1, 2)) + |> Enum.sort() == [1, 2, 3, 4, 10] + end + + test "keeps ordering" do + flow = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.map(fn(x) -> x + 1 end) + |> Flow.map(fn(x) -> x * 2 end) + assert Enum.sort(flow) == [6, 10, 14, 18, 22] + end + + test "keeps ordering after reduce" do + flow = + @flow + |> Flow.reduce(fn -> [] end, &[&1 | &2]) + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.map(fn(x) -> x + 1 end) + |> Flow.map(fn(x) -> x * 2 end) + assert Enum.sort(flow) == [6, 10, 14, 18, 22] + end + + test "keeps ordering after reduce + map_state" do + flow = + @flow + |> Flow.reduce(fn -> [] end, &[&1 | &2]) + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.map(fn(x) -> x + 1 end) + |> Flow.map(fn(x) -> x * 2 end) + |> Flow.map_state(&{&2, Enum.sort(&1)}) + |> Flow.map_state(&[&1]) + assert Enum.sort(flow) == [{{0, 4}, [6, 14, 18]}, + {{1, 4}, [22]}, + {{2, 4}, []}, + {{3, 4}, [10]}] + end + + test "start_link/2" do + parent = self() + + {:ok, pid} = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.each(&send(parent, &1)) + |> Flow.start_link() + + assert_receive 2 + assert_receive 4 + assert_receive 6 + refute_received 1 + + ref = Process.monitor(pid) + assert_receive {:DOWN, ^ref, _, _, _} + end + + test "into_stages/3" do + {:ok, forwarder} = GenStage.start_link(Forwarder, self()) + + {:ok, _} = + @flow + |> Flow.filter(&rem(&1, 2) == 0) + |> Flow.into_stages([forwarder]) + + assert_receive {:consumed, [2]} + assert_receive {:consumed, [4]} + assert_receive {:consumed, [6]} + assert_receive {:consumed, '\b'} + assert_receive {:consumed, '\n'} + end + end + + describe "stages-unpartioned-stream" do + @tag report: [:counter] + + setup do + {:ok, pid} = GenStage.start_link(Counter, 0) + {:ok, counter: pid} + 
end
+
+    test "only sources", %{counter: pid} do
+      assert Flow.from_stage(pid, stages: 1)
+             |> Enum.take(5)
+             |> Enum.sort() == [0, 1, 2, 3, 4]
+    end
+
+    # Builds the flow from the counter stage started in setup, so the failure
+    # path of a stage-based source is what gets exercised (not the @flow
+    # attribute defined by the previous describe block). The mapper raises on
+    # the first event, so consuming the flow is expected to exit.
+    @tag :capture_log
+    test "raises locally", %{counter: pid} do
+      assert catch_exit(Flow.from_stage(pid, stages: 1) |> Flow.map(fn _ -> raise "oops" end) |> Enum.to_list)
+    end
+
+    test "each/2", %{counter: pid} do
+      parent = self()
+      # each/2 must both run the side effect and pass the events through.
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.each(&send(parent, &1))
+             |> Enum.take(5)
+             |> Enum.sort() == [0, 1, 2, 3, 4]
+      assert_received 1
+      assert_received 2
+      assert_received 3
+    end
+
+    test "filter/2", %{counter: pid} do
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.filter(&rem(&1, 2) == 0)
+             |> Enum.take(5)
+             |> Enum.sort() == [0, 2, 4, 6, 8]
+    end
+
+    test "filter_map/3", %{counter: pid} do
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.filter_map(&rem(&1, 2) == 0, & &1 * 2)
+             |> Enum.take(5)
+             |> Enum.sort() == [0, 4, 8, 12, 16]
+    end
+
+    test "flat_map/2", %{counter: pid} do
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.flat_map(&[&1, &1])
+             |> Enum.take(5)
+             |> Enum.sort() == [0, 0, 1, 1, 2]
+    end
+
+    test "map/2", %{counter: pid} do
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.map(& &1 * 2)
+             |> Enum.take(5)
+             |> Enum.sort() == [0, 2, 4, 6, 8]
+    end
+
+    test "reject/2", %{counter: pid} do
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.reject(&rem(&1, 2) == 0)
+             |> Enum.take(5)
+             |> Enum.sort() ==
+             [1, 3, 5, 7, 9]
+    end
+
+    test "keeps ordering", %{counter: pid} do
+      # Mappers run in declaration order: keep evens, then + 1, then * 2.
+      assert Flow.from_stage(pid, stages: 1)
+             |> Flow.filter(&rem(&1, 2) == 0)
+             |> Flow.map(fn(x) -> x + 1 end)
+             |> Flow.map(fn(x) -> x * 2 end)
+             |> Enum.take(5)
+             |> Enum.sort() == [2, 6, 10, 14, 18]
+    end
+  end
+
+  describe "partition/2" do
+    test "allows custom partitioning" do
+      # The custom hash returns {event, 0} for every event; the expected value
+      # shows all ten events in one partition and the other three empty.
+      assert Flow.from_enumerables([[1, 2, 3], [4, 5, 6], 7..10])
+             |> Flow.partition(hash: fn x -> {x, 0} end, stages: 4)
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.map_state(&[Enum.sort(&1)])
+             |> Enum.sort() == [[], [], [], [1, 2, 3, 4, 5,
6, 7, 8, 9, 10]]
+    end
+
+    test "allows element based partitioning" do
+      # Partitions on tuple element 0; the second elements are then collected
+      # per partition, so equal keys must land in the same partition.
+      assert Flow.from_enumerables([[{1, 1}, {2, 2}, {3, 3}], [{1, 4}, {2, 5}, {3, 6}]])
+             |> Flow.partition(key: {:elem, 0}, stages: 2)
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.map_state(fn acc -> [acc |> Enum.map(&elem(&1, 1)) |> Enum.sort()] end)
+             |> Enum.sort() == [[1, 2, 4, 5], [3, 6]]
+    end
+
+    test "allows key based partitioning" do
+      # Same data as above expressed as maps, partitioned on the :key field.
+      assert Flow.from_enumerables([[%{key: 1, value: 1}, %{key: 2, value: 2}, %{key: 3, value: 3}],
+                                    [%{key: 1, value: 4}, %{key: 2, value: 5}, %{key: 3, value: 6}]])
+             |> Flow.partition(key: {:key, :key}, stages: 2)
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.map_state(fn acc -> [acc |> Enum.map(& &1.value) |> Enum.sort()] end)
+             |> Enum.sort() == [[1, 2, 4, 5], [3, 6]]
+    end
+
+    test "allows custom windowing" do
+      # The window function assigns 1..50 to 0 and 51..100 to 1_000, i.e. two
+      # one-second windows. The eight expected sums add up to 1275 (1..50)
+      # and 3775 (51..100) respectively.
+      window =
+        Flow.Window.fixed(1, :second, fn
+          x when x <= 50 -> 0
+          x when x <= 100 -> 1_000
+        end)
+
+      assert Flow.from_enumerable(1..100)
+             |> Flow.partition(window: window, stages: 4)
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.map_state(&[Enum.sum(&1)])
+             |> Enum.sort() == [173, 361, 364, 377, 797, 865, 895, 1218]
+    end
+  end
+
+  describe "take_sort/3" do
+    test "is equivalent to Enum.sort/1 ascending on the whole collection" do
+      # Random input: the property checked is that the first 100 elements
+      # agree with sorting the concatenation of all four lists.
+      list1 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+      list2 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+      list3 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+      list4 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+
+      assert Flow.from_enumerables([list1, list2, list3, list4])
+             |> Flow.partition()
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.take_sort(100)
+             |> Enum.at(0) ==
+             (list1 ++ list2 ++ list3 ++ list4) |> Enum.sort |> Enum.take(100)
+    end
+
+    test "is equivalent to Enum.sort/2 descending on the whole collection" do
+      list1 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+      list2 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+      list3
= Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+      list4 = Enum.map(1..1000, fn _ -> :rand.uniform(10000) end)
+
+      # Same property as the ascending case, with &>=/2 as the sorter on both sides.
+      assert Flow.from_enumerables([list1, list2, list3, list4])
+             |> Flow.partition()
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.take_sort(100, &>=/2)
+             |> Enum.at(0) ==
+             (list1 ++ list2 ++ list3 ++ list4) |> Enum.sort(&>=/2) |> Enum.take(100)
+    end
+  end
+
+  # NOTE(review): every call below passes three functions besides the flow,
+  # so the "/2" arity in this describe label looks stale; confirm.
+  describe "departition/2" do
+    test "joins partitioned data" do
+      # Each of the 4 partitions reduces to a sum; departition collects the
+      # four sums into one list, sorted by the final function.
+      assert Flow.from_enumerable(1..10)
+             |> Flow.partition(stages: 4)
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], &Enum.sort/1)
+             |> Enum.at(0) == [7, 10, 16, 22]
+    end
+
+    test "joins partitioned data with window info" do
+      # The final function is given a second argument, which the expected
+      # value shows to be :global here.
+      assert Flow.from_enumerable(1..10)
+             |> Flow.partition(stages: 4)
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], &{&2, Enum.sort(&1)})
+             |> Enum.at(0) == {:global, [7, 10, 16, 22]}
+    end
+
+    test "joins uneven partitioned data" do
+      # The hash sends 0 to partition 0 and everything else to partition 1.
+      # With input 1..10 the first clause never matches, so one partition
+      # receives no events; windows hold 3 events each.
+      assert Flow.from_enumerable(1..10)
+             |> Flow.partition(stages: 2, window: Flow.Window.count(3), hash: fn 0 -> {0, 0}; x -> {x, 1} end)
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], &Enum.sort/1)
+             |> Enum.to_list() == [[0, 6], [15], [24], [10]]
+    end
+
+    test "joins partitioned data with triggers" do
+      # Same pipeline twice; only the trigger mode differs (:keep vs :reset).
+      assert Flow.from_enumerable(1..10)
+             |> Flow.partition(stages: 4, window: Flow.Window.global |> Flow.Window.trigger_every(2, :keep))
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], &Enum.sort/1)
+             |> Enum.at(0) == [6, 7, 7, 8, 10, 16, 22, 22]
+
+      assert Flow.from_enumerable(1..10)
+             |> Flow.partition(stages: 4, window: Flow.Window.global |> Flow.Window.trigger_every(2, :reset))
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], &Enum.sort/1)
+             |> Enum.at(0) == [0, 0, 6, 7, 8, 8, 10, 16]
+    end
+
+    test "joins partitioned data with map operations" do
+      assert Flow.from_enumerable(1..10)
+             |>
Flow.partition(stages: 4)
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], & &1)
+             |> Flow.map(&Enum.sort/1)
+             |> Enum.at(0) == [7, 10, 16, 22]
+    end
+
+    test "joins partitioned data with reduce operations" do
+      # Whatever the triggers emit, the second reduce must add up to the sum
+      # of 1..10, which is 55.
+      assert Flow.from_enumerable(1..10)
+             |> Flow.partition(stages: 4, window: Flow.Window.global |> Flow.Window.trigger_every(2, :reset))
+             |> Flow.reduce(fn -> 0 end, &+/2)
+             |> Flow.departition(fn -> [] end, &[&1 | &2], &Enum.sort/1)
+             |> Flow.reduce(fn -> 0 end, & Enum.sum(&1) + &2)
+             |> Flow.emit(:state)
+             |> Enum.at(0) == 55
+    end
+
+    test "with start_link/1" do
+      parent = self()
+
+      # The final function reports the joined result to the test process.
+      {:ok, pid} =
+        Flow.from_enumerable(1..10)
+        |> Flow.partition(stages: 4)
+        |> Flow.reduce(fn -> 0 end, &+/2)
+        |> Flow.departition(fn -> [] end, &[&1 | &2], &send(parent, Enum.sort(&1)))
+        |> Flow.start_link
+
+      assert_receive [7, 10, 16, 22]
+      ref = Process.monitor(pid)
+      assert_receive {:DOWN, ^ref, _, _, _}
+    end
+  end
+
+  # Builds two flows, one over the odd and one over the even numbers of
+  # 1..100, doubles every element and merges them with the given options.
+  # Taken together the merged events are 2, 4, ..., 200, which sum to 10100.
+  defp merged_flows(options) do
+    flow1 =
+      Stream.take_every(1..100, 2)
+      |> Flow.from_enumerable()
+      |> Flow.map(& &1 * 2)
+
+    flow2 =
+      Stream.take_every(2..100, 2)
+      |> Flow.from_enumerable()
+      |> Flow.map(& &1 * 2)
+
+    Flow.merge([flow1, flow2], options)
+  end
+
+  describe "merge/2" do
+    test "merges different flows together" do
+      assert merged_flows(stages: 4, min_demand: 5)
+             |> Flow.reduce(fn -> 0 end, & &1 + &2)
+             |> Flow.emit(:state)
+             |> Enum.sum() == 10100
+    end
+
+    test "allows custom partitioning" do
+      # The custom hash returns {event, 0} for every event; the expected value
+      # shows the whole sum in one partition and 0 in the other three.
+      assert merged_flows(stages: 4, min_demand: 5, hash: fn x -> {x, 0} end)
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.map_state(&[Enum.sum(&1)])
+             |> Enum.sort() == [0, 0, 0, 10100]
+    end
+
+    test "allows custom windowing" do
+      # Events up to 100 are assigned to 0 and events up to 200 to 1_000,
+      # i.e. two one-second windows; the eight expected sums add up to 10100.
+      window =
+        Flow.Window.fixed(1, :second, fn
+          x when x <= 100 -> 0
+          x when x <= 200 -> 1_000
+        end)
+
+      assert merged_flows(window: window, stages: 4, min_demand: 5)
+             |> Flow.reduce(fn -> [] end, &[&1 | &2])
+             |> Flow.map_state(&[Enum.sum(&1)])
+             |> Enum.sort() == [594, 596,
654, 706, 1248, 1964, 2066, 2272]
+    end
+  end
+
+  # Join tests. In the "matching" cases the right key function is `& &1 - 3`,
+  # so right events 4, 5, 6 share keys with left events 1, 2, 3. In the
+  # "unmatching" cases both sides use the identity key and no keys coincide.
+  describe "bounded_join/7" do
+    test "inner joins two matching flows" do
+      assert Flow.bounded_join(:inner,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1 - 3, &{&1, &2})
+             |> Enum.sort() == [{1, 4}, {2, 5}, {3, 6}]
+    end
+
+    test "inner joins two unmatching flows" do
+      assert Flow.bounded_join(:inner,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1, &{&1, &2})
+             |> Enum.sort() == []
+    end
+
+    test "left joins two matching flows" do
+      # Unmatched left events are joined with nil on the right.
+      assert Flow.bounded_join(:left_outer,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1 - 3, &{&1, &2})
+             |> Enum.sort() == [{0, nil}, {1, 4}, {2, 5}, {3, 6}]
+    end
+
+    test "left joins two unmatching flows" do
+      assert Flow.bounded_join(:left_outer,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1, &{&1, &2})
+             |> Enum.sort() == [{0, nil}, {1, nil}, {2, nil}, {3, nil}]
+    end
+
+    test "right joins two matching flows" do
+      # Unmatched right events are joined with nil on the left.
+      assert Flow.bounded_join(:right_outer,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1 - 3, &{&1, &2})
+             |> Enum.sort() == [{1, 4}, {2, 5}, {3, 6}, {nil, 7}, {nil, 8}]
+    end
+
+    test "right joins two unmatching flows" do
+      assert Flow.bounded_join(:right_outer,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1, &{&1, &2})
+             |> Enum.sort() == [{nil, 4}, {nil, 5}, {nil, 6}, {nil, 7}, {nil, 8}]
+    end
+
+    test "outer joins two matching flows" do
+      # Unmatched events from either side are kept, padded with nil.
+      assert Flow.bounded_join(:full_outer,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1 - 3, &{&1, &2})
+             |> Enum.sort() == [{0, nil}, {1, 4}, {2, 5}, {3, 6}, {nil, 7}, {nil, 8}]
+    end
+
+    test "outer joins two unmatching flows" do
+      assert Flow.bounded_join(:full_outer,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+
Flow.from_enumerable([4, 5, 6, 7, 8]),
+                               & &1, & &1, &{&1, &2})
+             |> Enum.sort() == [{0, nil}, {1, nil}, {2, nil}, {3, nil},
+                                {nil, 4}, {nil, 5}, {nil, 6}, {nil, 7}, {nil, 8}]
+    end
+
+    test "joins two flows followed by mapper operation" do
+      # The joined pairs {1, 4}, {2, 5}, {3, 6} are summed element-wise.
+      assert Flow.bounded_join(:inner,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6]),
+                               & &1, & &1 - 3, &{&1, &2})
+             |> Flow.map(fn {k, v} -> k + v end)
+             |> Enum.sort() == [5, 7, 9]
+    end
+
+    test "joins two flows followed by reduce" do
+      # Pair sums are 5, 7 and 9 (21 in total), accumulated by the two join stages.
+      assert Flow.bounded_join(:inner,
+                               Flow.from_enumerable([0, 1, 2, 3]),
+                               Flow.from_enumerable([4, 5, 6]),
+                               & &1, & &1 - 3, &{&1, &2}, stages: 2)
+             |> Flow.reduce(fn -> 0 end, fn {k, v}, acc -> k + v + acc end)
+             |> Flow.emit(:state)
+             |> Enum.sort() == [9, 12]
+    end
+
+    test "joins mapper and reducer flows" do
+      # The left side is already partitioned and the right side only has a
+      # mapper; every k in 0..9 pairs with k + 10, so the sums total 190.
+      assert Flow.bounded_join(:inner,
+                               Flow.from_enumerable(0..9) |> Flow.partition(),
+                               Flow.from_enumerable(0..9) |> Flow.map(& &1 + 10),
+                               & &1, & &1 - 10, &{&1, &2}, stages: 2)
+             |> Flow.reduce(fn -> 0 end, fn {k, v}, acc -> k + v + acc end)
+             |> Flow.emit(:state)
+             |> Enum.sort() == [44, 146]
+    end
+
+    test "outer joins two flows with windows" do
+      # Fixed 10ms windows, using each event's own value as its timestamp.
+      window = Flow.Window.fixed(10, :millisecond, & &1) |> Flow.Window.trigger_every(2)
+      # Notice how 9 and 12 do not form a pair for being in different windows.
+      assert Flow.window_join(:full_outer,
+                              Flow.from_enumerable([0, 1, 2, 3, 9, 10, 11]),
+                              Flow.from_enumerable([4, 5, 6, 7, 8, 12, 13]),
+                              window, & &1, & &1 - 3, &{&1, &2})
+             |> Enum.sort() == [{0, nil}, {1, 4}, {2, 5}, {3, 6}, {9, nil},
+                                {10, 13}, {11, nil}, {nil, 7}, {nil, 8}, {nil, 12}]
+    end
+  end
+end
diff --git a/test/test_helper.exs b/test/test_helper.exs
new file mode 100644
index 0000000..b9803e3
--- /dev/null
+++ b/test/test_helper.exs
@@ -0,0 +1 @@
+ExUnit.start(assert_receive_timeout: 200)