diff --git a/.gitignore b/.gitignore index 74db598..d476f1a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ lib/ *.cmj *.cmi *.rei +reduce.aux +reduce.log diff --git a/lean-formalisation/.github/workflows/create-release.yml b/lean-formalisation/.github/workflows/create-release.yml new file mode 100644 index 0000000..6dacf77 --- /dev/null +++ b/lean-formalisation/.github/workflows/create-release.yml @@ -0,0 +1,22 @@ +name: Create Release + +on: + push: + branches: + - 'main' + - 'master' + paths: + - 'lean-toolchain' + +jobs: + lean-release-tag: + name: Add Lean release tag + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: lean-release-tag action + uses: leanprover-community/lean-release-tag@v1 + with: + do-release: true + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/lean-formalisation/.github/workflows/lean_action_ci.yml b/lean-formalisation/.github/workflows/lean_action_ci.yml new file mode 100644 index 0000000..db09247 --- /dev/null +++ b/lean-formalisation/.github/workflows/lean_action_ci.yml @@ -0,0 +1,21 @@ +name: Lean Action CI + +on: + push: + pull_request: + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read # Read access to repository contents + pages: write # Write access to GitHub Pages + id-token: write # Write access to ID tokens + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v5 + - uses: leanprover/lean-action@v1 + - uses: leanprover-community/docgen-action@v1 diff --git a/lean-formalisation/.github/workflows/update.yml b/lean-formalisation/.github/workflows/update.yml new file mode 100644 index 0000000..96b7622 --- /dev/null +++ b/lean-formalisation/.github/workflows/update.yml @@ -0,0 +1,41 @@ +name: Update Dependencies + +on: + # schedule: # Sets a schedule to trigger the workflow + # - cron: "0 8 * * *" # Every day at 08:00 AM UTC (see https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#schedule) + workflow_dispatch: # Allows the workflow to be triggered manually via the GitHub interface + +jobs: + check-for-updates: # Determines which updates to apply. + runs-on: ubuntu-latest + outputs: + is-update-available: ${{ steps.check-for-updates.outputs.is-update-available }} + new-tags: ${{ steps.check-for-updates.outputs.new-tags }} + steps: + - name: Run the action + id: check-for-updates + uses: leanprover-community/mathlib-update-action@v1 + # START CONFIGURATION BLOCK 1 + # END CONFIGURATION BLOCK 1 + do-update: # Runs the upgrade, tests it, and makes a PR/issue/commit. + runs-on: ubuntu-latest + permissions: + contents: write # Grants permission to push changes to the repository + issues: write # Grants permission to create or update issues + pull-requests: write # Grants permission to create or update pull requests + needs: check-for-updates + if: ${{ needs.check-for-updates.outputs.is-update-available == 'true' }} + strategy: # Runs for each update discovered by the `check-for-updates` job. + max-parallel: 1 # Ensures that the PRs/issues are created in order. + matrix: + tag: ${{ fromJSON(needs.check-for-updates.outputs.new-tags) }} + steps: + - name: Run the action + id: update-the-repo + uses: leanprover-community/mathlib-update-action/do-update@v1 + with: + tag: ${{ matrix.tag }} + # START CONFIGURATION BLOCK 2 + on_update_succeeds: pr # Create a pull request if the update succeeds + on_update_fails: issue # Create an issue if the update fails + # END CONFIGURATION BLOCK 2 diff --git a/lean-formalisation/.gitignore b/lean-formalisation/.gitignore new file mode 100644 index 0000000..bfb30ec --- /dev/null +++ b/lean-formalisation/.gitignore @@ -0,0 +1 @@ +/.lake diff --git a/lean-formalisation/README.md b/lean-formalisation/README.md new file mode 100644 index 0000000..993e9b8 --- /dev/null +++ b/lean-formalisation/README.md @@ -0,0 +1,13 @@ +# lean-formalisation + +## GitHub configuration + +To set up your new GitHub repository, follow these steps: + +* Under your repository name, click **Settings**. +* In the **Actions** section of the sidebar, click "General". +* Check the box **Allow GitHub Actions to create and approve pull requests**. +* Click the **Pages** section of the settings sidebar. +* In the **Source** dropdown menu, select "GitHub Actions". + +After following the steps above, you can remove this section from the README file. diff --git a/lean-formalisation/Reduce.lean b/lean-formalisation/Reduce.lean new file mode 100644 index 0000000..87883f4 --- /dev/null +++ b/lean-formalisation/Reduce.lean @@ -0,0 +1,625 @@ +import Mathlib.Data.Multiset.Basic +import Mathlib.Data.Multiset.AddSub + +open Multiset +open List + +universe u v + +namespace Reduce + +-- Section 2: Preliminaries. +-- Definition (Collection). +-- A collection is a function from keys to multisets of values. +abbrev Collection (K : Type u) (V : Type v) := + K → Multiset V + +-- Section 2: Preliminaries. +-- Update operation A × V → A, written in curried Lean style. +abbrev UpdateOp (A : Type u) (V : Type v) := + A → V → A + +-- Section 2: Preliminaries. +-- Definition (Pairwise Commutative Operation). +-- Pairwise commutativity: (a ⋆ v₁) ⋆ v₂ = (a ⋆ v₂) ⋆ v₁. +def pairwiseComm {A : Type u} {V : Type v} (op : UpdateOp A V) : Prop := + ∀ (a : A) (v₁ v₂ : V), op (op a v₁) v₂ = op (op a v₂) v₁ + +-- Section 2: Preliminaries. +-- Definition (Fold over Sequence for an Operation). +-- Fold over a finite sequence (list) for an update operation. +def foldSeq {A : Type u} {V : Type v} (op : UpdateOp A V) : A → List V → A + | a, [] => a + | a, v :: vs => foldSeq op (op a v) vs + +-- Section 2: Preliminaries. +-- Auxiliary property used in Theorem 1: +-- order-independence of folding over lists (foldSeq depends only on the permutation class). +def orderIndependent {A : Type u} {V : Type v} (op : UpdateOp A V) : Prop := + ∀ (a : A) (l₁ l₂ : List V), l₁.Perm l₂ → foldSeq op a l₁ = foldSeq op a l₂ + +-- Section 2: Preliminaries. +-- Definition (Fold over Multiset for an Operation), via an enumeration. +noncomputable def foldMultiset {A : Type u} {V : Type v} + (op : UpdateOp A V) (a : A) (s : Multiset V) : A := + foldSeq op a s.toList + +-- Sanity check: collections and a simple numeric update op. + +example {K V : Type u} (C : Collection K V) (k : K) : + Multiset V := + C k + +def addOp : UpdateOp Nat Nat := + fun a v => a + v + +example : pairwiseComm addOp := by + intro a v₁ v₂ + simp [addOp, Nat.add_comm, Nat.add_left_comm] + +-- Section 2: Theorem 1 (Characterisation of Multiset Fold). +-- One direction: order-independence implies pairwise commutativity. +theorem characterisationMultisetFold_forward {A : Type u} {V : Type v} + {op : UpdateOp A V} (h : orderIndependent op) : + pairwiseComm op := by + intro a v₁ v₂ + -- [v₁, v₂] is a permutation of [v₂, v₁] + have hperm : List.Perm [v₁, v₂] [v₂, v₁] := by + simpa using (List.Perm.swap v₂ v₁ []) + -- apply order-independence and expand the folds + have hfold := h a [v₁, v₂] [v₂, v₁] hperm + simpa [foldSeq] using hfold + +-- Section 2: Preliminaries (helper for Theorem 1). +-- foldSeq is invariant under list permutations, assuming pairwise commutativity. +theorem foldSeq_perm {A : Type u} {V : Type v} + (op : UpdateOp A V) (hcomm : pairwiseComm op) : + ∀ (a : A) {l₁ l₂ : List V}, l₁.Perm l₂ → foldSeq op a l₁ = foldSeq op a l₂ := by + intro a l₁ l₂ hperm + induction hperm generalizing a with + | nil => + rfl + | @cons x l₁ l₂ hperm ih => + -- x :: l₁ ~ x :: l₂ + simp [foldSeq, ih] -- both sides reduce to foldSeq op (op a x) … + | @swap x y l => + -- y :: x :: l ~ x :: y :: l + simp [foldSeq] -- goal: foldSeq op (op (op a y) x) l = foldSeq op (op (op a x) y) l + have hxy : op (op a y) x = op (op a x) y := by + have := hcomm a x y + -- hcomm: op (op a x) y = op (op a y) x + simpa using this.symm + simp [hxy] + | @trans l₁ l₂ l₃ h₁ h₂ ih₁ ih₂ => + -- l₁ ~ l₂ and l₂ ~ l₃ ⇒ l₁ ~ l₃ + calc + foldSeq op a l₁ = foldSeq op a l₂ := ih₁ a + _ = foldSeq op a l₃ := ih₂ a + +-- Section 2: Preliminaries. +-- Fold over append: folding over l₁ ++ l₂ is folding over l₁ then l₂. +lemma foldSeq_append {A : Type u} {V : Type v} + (op : UpdateOp A V) (a : A) (l₁ l₂ : List V) : + foldSeq op a (l₁ ++ l₂) = foldSeq op (foldSeq op a l₁) l₂ := by + induction l₁ generalizing a with + | nil => + -- ([] ++ l₂) = l₂ + simp [foldSeq] + | cons x xs ih => + -- ((x :: xs) ++ l₂) = x :: (xs ++ l₂) + simp [foldSeq, ih, List.cons_append] + +-- Section 2: Preliminaries. +-- Fold over Multiset is independent of the chosen enumeration (well-definedness). +lemma foldMultiset_wellDefined {A : Type u} {V : Type v} + (op : UpdateOp A V) (hcomm : pairwiseComm op) (a : A) + (s : Multiset V) (l : List V) (h : (l : Multiset V) = s) : + foldSeq op a l = foldMultiset op a s := by + classical + have h_toList : (s.toList : Multiset V) = s := Multiset.coe_toList s + have h_eq : (l : Multiset V) = (s.toList : Multiset V) := + h.trans h_toList.symm + have hperm : l.Perm s.toList := + (Multiset.coe_eq_coe (α := V)).1 h_eq + have hfold := foldSeq_perm op hcomm a hperm + simpa [foldMultiset] using hfold + +-- Section 2: Preliminaries. +-- Lemma (Fold over Union of Multisets). +lemma foldMultiset_union {A : Type u} {V : Type v} + (op : UpdateOp A V) (hcomm : pairwiseComm op) (a : A) + (M N : Multiset V) : + foldMultiset op a (M + N) = + foldMultiset op (foldMultiset op a M) N := by + classical + -- Represent `M + N` by the concatenation of `M.toList` and `N.toList`. + have hMN : + ((M.toList ++ N.toList : List V) : Multiset V) = M + N := by + -- First, relate `M.toList ++ N.toList` to the multiset sum of the lists. + have h₁ : + ((M.toList ++ N.toList : List V) : Multiset V) = + (M.toList + N.toList : Multiset V) := by + exact (Multiset.coe_add (M.toList) (N.toList)).symm + -- Then rewrite the multiset sum of the lists as `M + N`. + have h₂ : + (M.toList + N.toList : Multiset V) = M + N := by + simp [Multiset.coe_toList] + exact h₁.trans h₂ + -- Use well-definedness to switch from `foldMultiset` on `M + N` + -- to a fold over the concatenated list. + have h_fold : + foldMultiset op a (M + N) = + foldSeq op a (M.toList ++ N.toList) := by + have h0 := + foldMultiset_wellDefined (op := op) (hcomm := hcomm) (a := a) + (s := M + N) (l := M.toList ++ N.toList) hMN + -- `foldMultiset_wellDefined` gives `foldSeq = foldMultiset`, so we flip it. + simpa [foldMultiset] using h0.symm + -- Now use the list-level append lemma and unfold `foldMultiset` on `M` and `N`. + have h_append : + foldSeq op a (M.toList ++ N.toList) = + foldSeq op (foldSeq op a M.toList) N.toList := by + simpa using foldSeq_append op a M.toList N.toList + have h_rhs : + foldSeq op (foldSeq op a M.toList) N.toList = + foldMultiset op (foldMultiset op a M) N := by + simp [foldMultiset] + -- Put the pieces together. + calc + foldMultiset op a (M + N) + = foldSeq op a (M.toList ++ N.toList) := h_fold + _ = foldSeq op (foldSeq op a M.toList) N.toList := h_append + _ = foldMultiset op (foldMultiset op a M) N := h_rhs + +-- Section 2: Theorem 1 (Characterisation of Multiset Fold). +-- Converse direction: pairwise commutativity implies order-independence. +theorem characterisationMultisetFold_backward {A : Type u} {V : Type v} + {op : UpdateOp A V} (hcomm : pairwiseComm op) : + orderIndependent op := by + intro a l₁ l₂ hperm + exact foldSeq_perm op hcomm a hperm + +-- Section 2: Theorem 1 (Characterisation of Multiset Fold), as an equivalence. +theorem characterisationMultisetFold {A : Type u} {V : Type v} + (op : UpdateOp A V) : + orderIndependent op ↔ pairwiseComm op := + ⟨characterisationMultisetFold_forward, characterisationMultisetFold_backward⟩ + + +-- Section 4: Incremental Updates. +-- Definition (Reducer). +-- A reducer packs an initial value and add/remove update operations. +structure Reducer (A : Type u) (V : Type v) where + ι : A + add : UpdateOp A V + remove : UpdateOp A V + +-- Section 4: Incremental Updates. +-- Reducer-level pairwise commutativity (both add and remove). +def Reducer.pairwiseComm {A : Type u} {V : Type v} (R : Reducer A V) : Prop := + Reduce.pairwiseComm R.add ∧ Reduce.pairwiseComm R.remove + +-- Section 4: Incremental Updates. +-- Definition (Well-Formed Reducer). +-- Well-formedness: remove undoes add on reachable accumulator values. +def WellFormedReducer {A : Type u} {V : Type v} (R : Reducer A V) : Prop := + ∀ (M : Multiset V) (v : V), + R.remove (R.add (foldMultiset R.add R.ι M) v) v = + foldMultiset R.add R.ι M + +-- Section 4: Aggregate Classes. +-- Definition (Distributive Aggregate). +def DistributiveAggregate {A : Type u} {V : Type v} + (ι : A) (op : UpdateOp A V) : Prop := + ∀ (M N : Multiset V), + foldMultiset op ι (M + N) = + foldMultiset op (foldMultiset op ι M) N + +-- Section 4: Aggregate Classes. +-- Pairwise commutativity implies the distributive-aggregate law (Lemma 1). +lemma distributiveAggregate_of_pairwiseComm {A : Type u} {V : Type v} + (ι : A) {op : UpdateOp A V} (hcomm : pairwiseComm op) : + DistributiveAggregate ι op := by + intro M N + simpa [DistributiveAggregate] using + (foldMultiset_union (op := op) (hcomm := hcomm) (a := ι) M N) + +-- Section 4: Aggregate Classes. +-- Definition (Invertible Distributive Aggregate) for a reducer. +def InvertibleDistributiveAggregate {A : Type u} {V : Type v} + (R : Reducer A V) : Prop := + DistributiveAggregate R.ι R.add ∧ WellFormedReducer R + +-- Section 4: Aggregate Classes. +-- For reducers with pairwise-commutative add/remove, well-formedness +-- is equivalent to being an invertible distributive aggregate. +lemma wellFormedReducer_iff_invertibleDistributive {A : Type u} {V : Type v} + (R : Reducer A V) (hcomm : Reducer.pairwiseComm R) : + InvertibleDistributiveAggregate R ↔ WellFormedReducer R := by + constructor + · intro h + exact h.2 + · intro hWF + -- distributivity follows from pairwise commutativity of `add` + exact And.intro (distributiveAggregate_of_pairwiseComm R.ι hcomm.1) hWF + +-- Section 3: Reduce Combinator. +-- Definition (Reduce Combinator). +-- The reduce combinator for a reducer and a collection. +noncomputable def reduce {K : Type u} {A : Type u} {V : Type v} + (R : Reducer A V) (C : Collection K V) : K → A := + fun k => foldMultiset R.add R.ι (C k) + +-- Section 4: Incremental Updates. +-- Definition (Delta). +-- Deltas: added and removed multisets of values per key. +structure Delta (K : Type u) (V : Type v) where + added : K → Multiset V + removed : K → Multiset V + +-- Section 4: Incremental Updates. +-- Definition (Valid Delta for a Collection). +def ValidDelta {K : Type u} {V : Type v} + (C : Collection K V) (Δ : Delta K V) : Prop := + ∀ k, Δ.removed k ≤ C k + +-- Section 4: Incremental Updates. +-- Definition (Delta Application) on collections. +noncomputable def applyDelta {K : Type u} {V : Type v} [DecidableEq V] + (C : Collection K V) (Δ : Delta K V) : Collection K V := + fun k => (Multiset.sub (C k) (Δ.removed k)) + Δ.added k + +-- Section 4: Incremental Updates. +-- Definition (Incremental Reduce). +-- Incremental update: first remove, then add. +noncomputable def update {K : Type u} {A : Type u} {V : Type v} + (R : Reducer A V) (a : A) (Δ : Delta K V) (k : K) : A := + let a' := foldMultiset R.remove a (Δ.removed k) + foldMultiset R.add a' (Δ.added k) + +-- ------------------------------------------------------------- +-- Section 5: Correctness. +-- Incremental correctness property (per key) for a reducer. +def IncrementalCorrectnessProperty {K : Type u} {A : Type u} {V : Type v} + [DecidableEq V] (R : Reducer A V) : Prop := + ∀ (C : Collection K V) (Δ : Delta K V), + ValidDelta C Δ → + ∀ k : K, + reduce R (applyDelta C Δ) k = + update R (reduce R C k) Δ k + +-- The remaining Section 5 lemmas from the paper are left as +-- commented-out skeletons for now; we will turn them into +-- fully proved Lean lemmas one by one. + +-- Helper lemma: cancellation over a decomposed multiset M₀ ⊎ D. +lemma cancellation_decomposed {A : Type u} {V : Type v} + (R : Reducer A V) + (hWF : WellFormedReducer R) + (hAdd : pairwiseComm R.add) + (hRem : pairwiseComm R.remove) : + ∀ (D M₀ : Multiset V), + foldMultiset R.remove (foldMultiset R.add R.ι (M₀ + D)) D = + foldMultiset R.add R.ι M₀ := by + classical + intro D + -- Induction on the multiset `D`. + refine Multiset.induction_on D ?base ?step + · -- Base case: D = 0. + intro M₀ + simp [foldMultiset, foldSeq] + · -- Inductive step: D = v ::ₘ D' + intro v D' ih M₀ + -- Let M' := M₀ + D'. + let M' : Multiset V := M₀ + D' + -- First, relate M₀ + (v ::ₘ D') to M' + {v}. + have hM : M₀ + v ::ₘ D' = M' + ({v} : Multiset V) := by + -- M₀ + v ::ₘ D' = v ::ₘ (M₀ + D') + have h1 : M₀ + v ::ₘ D' = v ::ₘ (M₀ + D') := by + exact (Multiset.add_cons v M₀ D') + -- v ::ₘ (M₀ + D') = {v} + (M₀ + D') + have h2 : v ::ₘ (M₀ + D') = ({v} : Multiset V) + (M₀ + D') := by + -- `singleton_add` already states `{v} + (M₀ + D') = v ::ₘ (M₀ + D')`. + -- We just use it in the desired orientation. + exact (Multiset.singleton_add v (M₀ + D')).symm + -- {v} + (M₀ + D') = (M₀ + D') + {v} = M' + {v} + calc + M₀ + v ::ₘ D' + = v ::ₘ (M₀ + D') := h1 + _ = ({v} : Multiset V) + (M₀ + D') := h2 + _ = (M₀ + D') + ({v} : Multiset V) := by + simpa using + (Multiset.add_comm (({v} : Multiset V)) (M₀ + D')) + _ = M' + ({v} : Multiset V) := by + simp [M'] + -- Fold-add over M₀ + (v ::ₘ D') factors as fold-add over M' and then the singleton {v}. + have hAgg : + foldMultiset R.add R.ι (M₀ + v ::ₘ D') = + R.add (foldMultiset R.add R.ι M') v := by + -- Use the distributivity lemma for `add` on M' and {v}. + have h_union : + foldMultiset R.add R.ι (M' + ({v} : Multiset V)) = + foldMultiset R.add (foldMultiset R.add R.ι M') ({v} : Multiset V) := + foldMultiset_union (op := R.add) (hcomm := hAdd) + (a := R.ι) (M := M') (N := ({v} : Multiset V)) + -- Rewrite M' + {v} back to M₀ + v ::ₘ D', and simplify the singleton fold. + have h_union' : + foldMultiset R.add R.ι (M₀ + v ::ₘ D') = + foldMultiset R.add (foldMultiset R.add R.ι M') ({v} : Multiset V) := by + -- First rewrite the multiset argument using `hM`, then apply `h_union`. + calc + foldMultiset R.add R.ι (M₀ + v ::ₘ D') + = foldMultiset R.add R.ι (M' + ({v} : Multiset V)) := by + simp [hM] + _ = foldMultiset R.add (foldMultiset R.add R.ι M') ({v} : Multiset V) := + h_union + -- Over a singleton multiset, `foldMultiset` is a single application of `add`. + simpa [foldMultiset, foldSeq] using h_union' + -- Removing the singleton {v} cancels the last add, by well-formedness. + have h_remove_singleton : + foldMultiset R.remove + (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) ({v} : Multiset V) = + foldMultiset R.add R.ι M' := by + -- First identify the accumulator using `hAgg`, then apply well-formedness. + have hWF_M' := hWF M' v + -- From `hAgg` and well-formedness we get a cancellation equation at `v`. + have h_cancel : + R.remove (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) v = + foldMultiset R.add R.ι M' := by + -- Rewrite the accumulator using `hAgg`, then apply `hWF_M'`. + have hAgg' : + R.remove (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) v = + R.remove (R.add (foldMultiset R.add R.ι M') v) v := + congrArg (fun x => R.remove x v) hAgg + exact hAgg'.trans hWF_M' + -- `foldMultiset` over a singleton is just one application of the op. + simpa [foldMultiset, foldSeq] using h_cancel + -- Now decompose the fold over D = v ::ₘ D' as a fold over {v} then D'. + calc + foldMultiset R.remove (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) (v ::ₘ D') + = foldMultiset R.remove + (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) + (({v} : Multiset V) + D') := by + -- Rewrite `v ::ₘ D'` as `{v} + D'`. + have hD : (v ::ₘ D') = ({v} : Multiset V) + D' := by + -- `singleton_add` already states `{v} + D' = v ::ₘ D'`. + -- We just use it in the desired orientation. + exact (Multiset.singleton_add v D').symm + exact congrArg + (fun s => foldMultiset R.remove + (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) s) + hD + _ = foldMultiset R.remove + (foldMultiset R.remove + (foldMultiset R.add R.ι (M₀ + v ::ₘ D')) + ({v} : Multiset V)) + D' := by + -- Apply the distributivity lemma for `remove` over `{v} + D'`. + exact + (foldMultiset_union (op := R.remove) (hcomm := hRem) + (a := foldMultiset R.add R.ι (M₀ + v ::ₘ D')) + (M := ({v} : Multiset V)) (N := D')) + _ = foldMultiset R.remove + (foldMultiset R.add R.ι M') D' := by + -- Cancel the `{v}` using `h_remove_singleton`. + exact + congrArg + (fun x => foldMultiset R.remove x D') + h_remove_singleton + _ = foldMultiset R.add R.ι M₀ := by + -- Use the induction hypothesis on D' and M₀, noting M' = M₀ + D'. + simpa [M'] using ih M₀ + +-- Section 5: Correctness. +-- Cancellation for a Delta, phrased for arbitrary collections. +lemma cancellationForDelta {K : Type u} {A : Type u} {V : Type v} + [DecidableEq V] + (R : Reducer A V) + (hWF : WellFormedReducer R) + (hcomm : Reducer.pairwiseComm R) + (C : Collection K V) (Δ : Delta K V) + (hΔ : ValidDelta C Δ) (k : K) : + foldMultiset R.remove + (foldMultiset R.add R.ι (C k)) (Δ.removed k) = + foldMultiset R.add R.ι ((C k) - Δ.removed k) := by + classical + -- Notation: M is the original multiset, D the removed part. + let M : Multiset V := C k + let D : Multiset V := Δ.removed k + have hle : D ≤ M := by + -- ValidDelta tells us that removed k ≤ C k. + simpa [M, D] using hΔ k + -- Let M₀ be the remaining multiset after removing D. + let M₀ : Multiset V := M - D + -- Using multiset subtraction, we know that (M - D) + D = M. + have h_decomp : M₀ + D = M := by + -- `sub_add_cancel` yields `M - D + D = M`. + simpa [M₀] using (Multiset.sub_add_cancel (s := M) (t := D) hle) + -- Apply cancellation over the decomposed multiset M₀ ⊎ D. + have h_cancel := + cancellation_decomposed (R := R) (hWF := hWF) + (hAdd := hcomm.1) (hRem := hcomm.2) D M₀ + -- Rewrite the accumulator using the decomposition M = M₀ + D. + have h_cancel' : + foldMultiset R.remove + (foldMultiset R.add R.ι M) D = + foldMultiset R.add R.ι M₀ := by + -- `h_cancel` is stated with `M₀ + D`; rewrite it to use `M`. + simpa [h_decomp] using h_cancel + -- Finally, unfold the local definitions of M, D, M₀. + simpa [M, D, M₀] using h_cancel' + +-- Section 5: Correctness. +-- (Soundness direction) Well-formed reducers satisfy incremental correctness. +theorem wellFormedReducer_implies_incrementalCorrectness + {K : Type u} {A : Type u} {V : Type v} [DecidableEq V] + (R : Reducer A V) (hWF : WellFormedReducer R) (hcomm : Reducer.pairwiseComm R) : + IncrementalCorrectnessProperty (K := K) R := by + classical + -- We unfold the incremental correctness property and work pointwise. + intro C Δ hΔ k + -- Notation at the key k. + let M : Multiset V := C k + let Dminus : Multiset V := Δ.removed k + let Dplus : Multiset V := Δ.added k + -- Distributivity of the add operation. + have hDA : DistributiveAggregate R.ι R.add := + distributiveAggregate_of_pairwiseComm R.ι hcomm.1 + -- First compute the denotational recomputation path. + have h_recompute : + reduce R (applyDelta C Δ) k = + foldMultiset R.add R.ι ((Multiset.sub M Dminus) + Dplus) := by + -- `applyDelta` removes `Dminus` and then adds `Dplus`. + simp [reduce, applyDelta, M, Dminus, Dplus] + -- Next compute the incremental update path. + have h_incremental : + update R (reduce R C k) Δ k = + foldMultiset R.add + (foldMultiset R.add R.ι (Multiset.sub M Dminus)) Dplus := by + -- Start from the definition of `update`, then simplify the removal step + -- using `cancellationForDelta`, and finally rewrite using our notation. + have h_cancel := + cancellationForDelta (R := R) (hWF := hWF) (hcomm := hcomm) + (C := C) (Δ := Δ) (hΔ := hΔ) (k := k) + -- `h_cancel` simplifies the intermediate accumulator after removals. + have h_cancel' : + foldMultiset R.remove + (foldMultiset R.add R.ι M) Dminus = + foldMultiset R.add R.ι (Multiset.sub M Dminus) := by + -- This is just `h_cancel` specialized at `k`, rewritten with our + -- local names for `M` and `Dminus`. + -- Note that `M - Dminus` is definitionally `Multiset.sub M Dminus`. + simpa [M, Dminus] using h_cancel + -- Now expand `update` and `reduce` and use `h_cancel'`. + simp [update, reduce, M, Dminus, Dplus, h_cancel'] + -- Finally, connect both paths via distributivity of `add`. + have h_distrib : + foldMultiset R.add R.ι ((Multiset.sub M Dminus) + Dplus) = + foldMultiset R.add + (foldMultiset R.add R.ι (Multiset.sub M Dminus)) Dplus := + hDA (Multiset.sub M Dminus) Dplus + -- Chain the equalities to conclude. + calc + reduce R (applyDelta C Δ) k + = foldMultiset R.add R.ι ((M - Dminus) + Dplus) := h_recompute + _ = foldMultiset R.add + (foldMultiset R.add R.ι (M - Dminus)) Dplus := h_distrib + _ = update R (reduce R C k) Δ k := h_incremental.symm + +-- Section 5: Correctness. +-- (Full characterization) Well-formedness ⇔ incremental correctness. +-- We assume `K` is inhabited so that the incremental +-- correctness property is non-vacuous. +theorem wellFormedReducer_iff_incrementalCorrectness + {K : Type u} {A : Type u} {V : Type v} [DecidableEq V] [Inhabited K] + (R : Reducer A V) (hcomm : Reducer.pairwiseComm R) : + WellFormedReducer R ↔ IncrementalCorrectnessProperty (K := K) R := by + constructor + · -- Soundness direction: well-formed ⇒ incremental correctness. + intro hWF + exact wellFormedReducer_implies_incrementalCorrectness + (K := K) (R := R) hWF hcomm + · -- Completeness direction: incremental correctness ⇒ well-formedness. + intro hIC + classical + -- We must show that remove undoes add on reachable + -- accumulator values, i.e. on folds over multisets. + refine fun M v => ?_ + -- Define a constant collection with a single logical key, + -- whose multiset is `M ⊎ {v}` at every key. + let C' : Collection K V := fun _ => M + ({v} : Multiset V) + -- Define a delta that removes `{v}` and adds nothing. + let Δ' : Delta K V := + { added := fun _ => (0 : Multiset V) + removed := fun _ => ({v} : Multiset V) } + -- This delta is valid for `C'`: the removed multiset + -- is always a sub-multiset of `M ⊎ {v}`. + have hValid : ValidDelta C' Δ' := by + intro k + -- `{v} ≤ M ⊎ {v}` since `v ∈ M ⊎ {v}`. + have hv : v ∈ C' k := by + simp [C'] + exact (Multiset.singleton_le (a := v) (s := C' k)).2 hv + -- Instantiate incremental correctness at `C'`, `Δ'` + -- and an arbitrary key (the default inhabitant of `K`). + have hEq := hIC (C := C') (Δ := Δ') hValid (default : K) + -- Simplify both sides of the correctness equation. + -- Left-hand side: recompute via `applyDelta`. + have h_left : + reduce R (applyDelta C' Δ') (default : K) = + foldMultiset R.add R.ι M := by + -- `applyDelta` performs `(M ⊎ {v}) - {v} + ∅ = M`. + have h_sub : + Multiset.sub (M + ({v} : Multiset V)) ({v} : Multiset V) = M := by + classical + -- Rewrite via singleton subtraction and erase laws. + have hv : v ∈ ({v} : Multiset V) := by simp + calc + Multiset.sub (M + ({v} : Multiset V)) ({v} : Multiset V) + = (M + ({v} : Multiset V)) - ({v} : Multiset V) := rfl + _ = (M + ({v} : Multiset V)).erase v := by + simp [Multiset.sub_singleton] + _ = M + ({v} : Multiset V).erase v := by + -- Remove `v` from the right summand `{v}`. + simp [Multiset.erase_add_right_pos, hv] + _ = M + 0 := by + simp + _ = M := by + simp + simp [reduce, applyDelta, C', Δ', h_sub] + -- Right-hand side: incremental path via `update`. + have h_right : + update R (reduce R C' (default : K)) Δ' (default : K) = + foldMultiset R.remove + (foldMultiset R.add R.ι (M + ({v} : Multiset V))) + ({v} : Multiset V) := by + -- First unfold `reduce` at `C'`, then `update`, + -- and finally simplify the empty-add case. + simp [reduce, update, C', Δ', foldMultiset, foldSeq] + -- Combine the simplified forms to get an equation + -- between the two explicit accumulators. + have h_fold : + foldMultiset R.add R.ι M = + foldMultiset R.remove + (foldMultiset R.add R.ι (M + ({v} : Multiset V))) + ({v} : Multiset V) := by + -- Rewrite both sides of `hEq` using `h_left` and `h_right`. + -- Note that `h_left` and `h_right` are definitional equalities, + -- so we can substitute them into `hEq`. + simpa [h_left, h_right] using hEq + -- Now rewrite the accumulator over `M ⊎ {v}` using + -- the distributive law for the add operation, exploiting + -- pairwise commutativity of `add`. + have h_add_singleton : + foldMultiset R.add R.ι (M + ({v} : Multiset V)) = + R.add (foldMultiset R.add R.ι M) v := by + -- First use the multiset-union fold lemma, then simplify + -- the fold over the singleton `{v}`. + have h_union : + foldMultiset R.add R.ι (M + ({v} : Multiset V)) = + foldMultiset R.add + (foldMultiset R.add R.ι M) + ({v} : Multiset V) := + foldMultiset_union (op := R.add) (hcomm := hcomm.1) + (a := R.ι) (M := M) (N := ({v} : Multiset V)) + -- Over a singleton multiset, `foldMultiset` applies `add` once. + simpa [foldMultiset, foldSeq] using h_union + -- Substitute `h_add_singleton` into `h_fold` and + -- expand the fold over `{v}` on the remove side. + have h_final : + foldMultiset R.add R.ι M = + R.remove (R.add (foldMultiset R.add R.ι M) v) v := by + -- Rewrite the accumulator over `M ⊎ {v}` and then + -- expand the fold over the singleton `{v}`. + have h_fold' : + foldMultiset R.add R.ι M = + foldMultiset R.remove + (R.add (foldMultiset R.add R.ι M) v) + ({v} : Multiset V) := by + simpa [h_add_singleton] using h_fold + -- Now expand the fold over `{v}`. + simpa [foldMultiset, foldSeq] using h_fold' + -- Rearrange `h_final` to match the well-formedness statement. + exact h_final.symm + +end Reduce diff --git a/lean-formalisation/Reduce/Basic.lean b/lean-formalisation/Reduce/Basic.lean new file mode 100644 index 0000000..a0437ce --- /dev/null +++ b/lean-formalisation/Reduce/Basic.lean @@ -0,0 +1 @@ +import Reduce diff --git a/lean-formalisation/lake-manifest.json b/lean-formalisation/lake-manifest.json new file mode 100644 index 0000000..85e7b19 --- /dev/null +++ b/lean-formalisation/lake-manifest.json @@ -0,0 +1,95 @@ +{"version": "1.1.0", + "packagesDir": ".lake/packages", + "packages": + [{"url": "https://github.com/leanprover-community/mathlib4", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "d14bbde96b6a617c26108b416d0ae41c3ad65def", + "name": "mathlib", + "manifestFile": "lake-manifest.json", + "inputRev": "master", + "inherited": false, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/leanprover-community/plausible", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "74835c84b38e4070b8240a063c6417c767e551ae", + "name": "plausible", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/LeanSearchClient", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "3591c3f664ac3719c4c86e4483e21e228707bfa2", + "name": "LeanSearchClient", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/import-graph", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "6e3bb4bf31f731ab28891fe229eb347ec7d5dad3", + "name": "importGraph", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/ProofWidgets4", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "2aaad968dd10a168b644b6a5afd4b92496af4710", + "name": "proofwidgets", + "manifestFile": "lake-manifest.json", + "inputRev": "v0.0.82", + "inherited": true, + "configFile": "lakefile.lean"}, + {"url": "https://github.com/leanprover-community/aesop", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "ea86e311a31a4dfa2abf3d7c0664b8c28499369e", + "name": "aesop", + "manifestFile": "lake-manifest.json", + "inputRev": "master", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/quote4", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "a31845b5557fd5e47d52b9e2977a1b0eff3c38c3", + "name": "Qq", + "manifestFile": "lake-manifest.json", + "inputRev": "master", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover-community/batteries", + "type": "git", + "subDir": null, + "scope": "leanprover-community", + "rev": "c0b0d08446817bd410548d9ea8ddcccf5d89f63f", + "name": "batteries", + "manifestFile": "lake-manifest.json", + "inputRev": "main", + "inherited": true, + "configFile": "lakefile.toml"}, + {"url": "https://github.com/leanprover/lean4-cli", + "type": "git", + "subDir": null, + "scope": "leanprover", + "rev": "7e1ced9e049a4fab2508980ec4877ca9c46dffc9", + "name": "Cli", + "manifestFile": "lake-manifest.json", + "inputRev": "v4.26.0-rc2", + "inherited": true, + "configFile": "lakefile.toml"}], + "name": "«lean-formalisation»", + "lakeDir": ".lake"} diff --git a/lean-formalisation/lakefile.toml b/lean-formalisation/lakefile.toml new file mode 100644 index 0000000..788973c --- /dev/null +++ b/lean-formalisation/lakefile.toml @@ -0,0 +1,17 @@ +name = "lean-formalisation" +version = "0.1.0" +keywords = ["math"] +defaultTargets = ["Reduce"] + +[leanOptions] +pp.unicode.fun = true # pretty-prints `fun a ↦ b` +relaxedAutoImplicit = false +weak.linter.mathlibStandardSet = true +maxSynthPendingDepth = 3 + +[[require]] +name = "mathlib" +scope = "leanprover-community" + +[[lean_lib]] +name = "Reduce" diff --git a/lean-formalisation/lean-toolchain b/lean-formalisation/lean-toolchain new file mode 100644 index 0000000..be1fbc3 --- /dev/null +++ b/lean-formalisation/lean-toolchain @@ -0,0 +1 @@ +leanprover/lean4:v4.26.0-rc2 diff --git a/reduce.pdf b/reduce.pdf new file mode 100644 index 0000000..7ac856a Binary files /dev/null and b/reduce.pdf differ diff --git a/reduce.tex b/reduce.tex new file mode 100644 index 0000000..1a23297 --- /dev/null +++ b/reduce.tex @@ -0,0 +1,806 @@ +\documentclass{article} +\usepackage{amsmath, amssymb, amsthm} +\usepackage{mathpartir} +\usepackage{stmaryrd} +\usepackage{url} + +% Define big versions of oplus/ominus for folding notation +\newcommand{\bigominus}{\mathop{\text{\Large$\ominus$}}} + +\newtheorem{definition}{Definition} +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}{Lemma} +\newtheorem{remark}{Remark} +\newtheorem{example}{Example} + +\title{A Formal Semantics for Skip's Reactive \texttt{reduce} Combinator} +\author{Cristiano Calcagno} +\date{November 2025} + +\begin{document} +\maketitle + +\begin{abstract} +We present a formal semantics for the \texttt{reduce} combinator in Skip~\cite{skip}, a reactive programming framework that maintains aggregated views of collections with automatic incremental updates. +Skip lets developers define aggregation logic declaratively as pure functions over collections, while the runtime handles event routing, cache invalidation, and state synchronization. +Skip exposes reducers as \textbf{first-class combinators}: a reducer $R = (\iota, \oplus, \ominus)$ is a reusable, composable value---not a callback tied to a specific engine context---that developers can apply uniformly across arbitrary reactive pipelines. +Unlike streaming systems (Flink, Kafka Streams) that provide add/remove callbacks bound to specific operators, and unlike FRP and incremental computation frameworks that hide inverses internally, Skip surfaces this algebraic structure as a user-facing programming construct. +We formalize Skip's correctness condition and prove that incremental correctness holds \emph{if and only if} the reducer is well-formed (i.e., $\ominus$ is the inverse of $\oplus$), giving Skip users a precise, user-facing specification for writing correct custom reducers. +\end{abstract} + +\section{Introduction} + +Skip~\cite{skip} is a reactive programming framework that maintains derived views of collections with automatic incremental updates. +When the underlying data changes, Skip efficiently propagates updates to all dependent computations without manual intervention. +A central operation in Skip is the \texttt{reduce} combinator, which computes a summary (or \emph{view}) for each key in a collection by folding over its associated values---for example, computing the sum, count, or minimum. + +The key to efficient updates is that Skip's \texttt{reduce} supports \emph{retractions}: when values are removed from a collection, the reducer can incrementally update the accumulated result rather than recomputing from scratch. +Skip exposes this capability through a \emph{reducer} abstraction $R = (\iota, \oplus, \ominus)$, where $\iota$ is an initial value, $\oplus$ is an add operation, and $\ominus$ is a remove operation. +This allows $O(1)$ updates per change, rather than $O(n)$ recomputation. + +\paragraph{Declarative programming model.} +Skip enables developers to write aggregation logic declaratively, composing reducers as pure, reusable values that express \emph{what} should be computed over collections. +The Skip runtime maintains derived views and propagates incremental updates, so developers need not reason about event routing, cache invalidation, or state synchronization. + +\paragraph{Skip's user-facing combinators.} +A distinguishing feature of Skip is that reducers are exposed as \textbf{user-facing combinators}---first-class programming constructs that developers use directly to build custom reactive services. +Rather than being internal implementation details hidden from users, Skip allows developers to define their own reducers for domain-specific aggregations. +Skip's documentation specifies an informal correctness condition: the result of applying the runtime's sequence of remove/add calls must equal recomputing from scratch. +This design enables extensibility (custom aggregations), composability (reducers combine with other Skip combinators), and---when the condition is satisfied---correctness guarantees. + +\paragraph{What distinguishes Skip's design.} +Three aspects distinguish Skip's reducer from related systems: +\begin{enumerate} + \item \textbf{First-class combinator, not callback.} Unlike Flink's \texttt{retract} methods or Kafka Streams' \texttt{Subtractor} interfaces---which are callbacks tied to specific engine contexts---Skip's reducer is a standalone, reusable value applicable to any reactive collection. This enables a declarative programming model where developers compose logic without infrastructure concerns; the reactive runtime handles event propagation and cache management transparently. + \item \textbf{User-facing inverse.} Unlike FRP libraries (which provide only append-style folds) and incremental computation frameworks (which handle inverses internally), Skip exposes $\ominus$ as an explicit user-defined operation. + \item \textbf{Formal correctness.} While prior work has studied invertible update operators and user-supplied inverses in more specialized settings, to our knowledge no mainstream streaming or reactive system provides a formal specification that application developers can apply when defining their own reducers; this paper fills that gap for Skip. +\end{enumerate} + +\paragraph{This paper.} +We formalize Skip's reducer abstraction and its correctness condition. +Our main result (Theorem~\ref{thm:equivalence}) proves that incremental correctness holds \emph{if and only if} the reducer is well-formed---that is, $\ominus$ is the inverse of $\oplus$. +This provides Skip users with a precise specification: satisfy the well-formedness condition, and your custom reducer is guaranteed to work correctly with Skip's incremental update mechanism. + +\paragraph{Synthesis of ideas.} +Our formalization builds on Skip's design and synthesizes ideas from multiple domains, which anchor the rest of the paper: +\begin{itemize} + \item From \textbf{incremental databases and streaming systems} (distributive, algebraic, holistic aggregates~\cite{viewmaintenance}), we carry over the aggregate-class taxonomy and view maintenance perspective, and the insight that invertibility yields $O(1)$ per-change maintenance for suitable aggregates. + \item From \textbf{Skip's reactive runtime}, we take the user-facing combinator contract: reducers are first-class, composable values whose invertibility the runtime can exploit, and which otherwise fall back safely to recompute. + \item From \textbf{formal methods}, we provide a complete characterization: not merely sufficient conditions for correctness, but a precise \emph{if-and-only-if} theorem that fully characterizes when Skip's incremental updates are correct. +\end{itemize} + +\paragraph{Contributions.} +This paper formalizes Skip's \texttt{reduce} combinator. We provide: +\begin{itemize} + \item A denotational semantics for \texttt{reduce} as a derived view (Section~\ref{sec:reduce}) + \item A formal model of deltas and Skip's incremental update procedure (Section~\ref{sec:incremental}) + \item A precise well-formedness condition and proof that it is both necessary and sufficient for correctness (Section~\ref{sec:correctness}) + \item A complexity contract: well-formed reducers admit $O(1)$ per-key updates, while partial reducers fall back to recomputation (Sections~\ref{sec:incremental} and~\ref{sec:examples}) + \item Concrete examples including sum, count, and min reducers (Section~\ref{sec:examples}) +\end{itemize} +We also position Skip's design relative to other streaming and reactive systems (Section~\ref{sec:related}) and analyze the complexity benefits of incremental updates. + +\section{Preliminaries}\label{sec:preliminaries} + +Let $K$ be a set of keys, $V$ a set of values, and $A$ a set of accumulator values. +For a set $V$, we write $\mathcal{M}(V)$ for the set of finite multisets over $V$; we use $\uplus$ and $\setminus$ for multiset union and multiset difference, respectively, and write $M \subseteq N$ for multisets when every element has multiplicity in $M$ less than or equal to its multiplicity in $N$. + +\begin{definition}[Collection] +A \emph{collection} is a function $C : K \to \mathcal{M}(V)$. We write $C(k)$ for the multiset of values associated with key $k$. +\end{definition} + +\begin{example}[Purchases per user] +As a running example, let $K = \{\mathit{user1}, \mathit{user2}\}$ and $V = \mathbb{Z}$. +We model purchases per user and care about which purchase amounts occurred, not about the order in which they happened. +A collection $C : K \to \mathcal{M}(V)$ might be given by +\[ +C(\mathit{user1}) = \{30,30,50\} +\quad\text{and}\quad +C(\mathit{user2}) = \{70\}. +\] +Here $C(\mathit{user1})$ is a multiset where $30$ has multiplicity $2$ and $50$ has multiplicity $1$, meaning that \emph{user1} made two purchases of \$30 and one purchase of \$50 (in some order). +This illustrates that multiple purchases can have the same amount, and that multisets record these multiplicities while abstracting away from order. +\end{example} + +\subsection{Commutative Operations} + +\begin{definition}[Pairwise Commutative Operation]\label{def:pairwise-commutative} +Let $\star : A \times V \to A$ be an update operation. +We say that $\star$ is \emph{pairwise commutative} if +\[ +\forall a \in A, v_1, v_2 \in V.\; (a \star v_1) \star v_2 = (a \star v_2) \star v_1. +\] +\end{definition} + +\subsection{Folds} + +\begin{definition}[Fold over Sequence for an Operation] +Let $\star : A \times V \to A$ be an update operation and let $s = [v_1,\dots,v_n]$ be a finite sequence of elements of $V$. +For any $a \in A$ we define: +\[ +\mathsf{fold}^{\mathsf{seq}}_\star(a, []) = a +\quad\text{and}\quad +\mathsf{fold}^{\mathsf{seq}}_\star(a, v_1 :: s') = \mathsf{fold}^{\mathsf{seq}}_\star(a \star v_1, s'). +\] +When a distinguished initial element $\iota \in A$ is understood from context, we write $\mathsf{fold}^{\mathsf{seq}}_\star(s)$ for $\mathsf{fold}^{\mathsf{seq}}_\star(\iota, s)$. +\end{definition} + +\begin{theorem}[Characterisation of Multiset Fold] +Let $\star : A \times V \to A$ be an update operation. +The following are equivalent: +\begin{enumerate} + \item For all $a \in A$, $M \in \mathcal{M}(V)$ and any two finite sequences $s_1, s_2$ enumerating $M$ (with multiplicity), we have + \[ + \mathsf{fold}^{\mathsf{seq}}_\star(a, s_1) = \mathsf{fold}^{\mathsf{seq}}_\star(a, s_2). + \] + That is, folding depends only on the multiset of elements, not on their enumeration. + \item The operation $\star$ is pairwise commutative in the sense of Definition~\ref{def:pairwise-commutative}. +\end{enumerate} +\end{theorem} + +\begin{proof} +Sketch: +For (2 $\Rightarrow$ 1), one shows first that swapping two adjacent elements in a sequence does not change the fold, using pairwise commutativity of $\star$. +Since any permutation of a finite sequence can be written as a product of adjacent transpositions, it follows that the fold depends only on the underlying multiset. +For (1 $\Rightarrow$ 2), instantiate (1) with the two sequences $[v_1,v_2]$ and $[v_2,v_1]$ enumerating the same multiset $\{v_1,v_2\}$, and expand the definition of the fold to obtain $(a \star v_1) \star v_2 = (a \star v_2) \star v_1$. +\end{proof} + +\begin{definition}[Fold over Multiset for an Operation] +Let $\star : A \times V \to A$ be pairwise commutative and let $M \in \mathcal{M}(V)$ be finite. +For $a \in A$ and any sequence $s$ enumerating $M$ (with multiplicity), we set +\[ +\mathsf{fold}_\star(a, M) := \mathsf{fold}^{\mathsf{seq}}_\star(a, s), +\] +which is well-defined by the Characterisation of Multiset Fold. +If an initial element $\iota \in A$ is fixed, we abbreviate $\mathsf{fold}_\star(M) := \mathsf{fold}_\star(\iota, M)$. +\end{definition} + +\begin{example}[Total spent over a multiset] +Continuing the purchase example, let $A = V = \mathbb{Z}$ and define $\star(a,v) = a + v$ with initial element $\iota = 0$. +For the multiset $M = \{30,30,50\}$ of purchases for \emph{user1} we have +\[ +\mathsf{fold}_\star(0, M) = 0 + 30 + 30 + 50 = 110, +\] +regardless of the order in which the elements of $M$ are enumerated. +\end{example} + +\begin{lemma}[Fold over Union of Multisets] +Let $\star : A \times V \to A$ be pairwise commutative and let $M, N \in \mathcal{M}(V)$ be finite multisets. +Then for all $a \in A$: +\[ +\mathsf{fold}_\star(a, M \uplus N) = \mathsf{fold}_\star(\mathsf{fold}_\star(a, M), N). +\] +\end{lemma} + +\begin{proof} +Choose an enumeration of $M \uplus N$ in which all elements of $M$ appear first, followed by all elements of $N$. +The result then follows immediately from the definition of $\mathsf{fold}^{\mathsf{seq}}_\star$ and the fact that $\mathsf{fold}_\star$ is independent of the particular enumeration. +\end{proof} + +\begin{example}[Decomposing total spend] +With $\star(a,v) = a+v$ and $\iota = 0$, suppose we split \emph{user1}'s purchases as $M = \{30\}$ and $N = \{30,50\}$ so that $M \uplus N = \{30,30,50\}$. +Then +\[ +\mathsf{fold}_\star(0, M \uplus N) + = 0 + 30 + 30 + 50 + = \mathsf{fold}_\star(\mathsf{fold}_\star(0, M), N), +\] +showing that we can compute the total spent either directly on $M \uplus N$ or by first aggregating over $M$ and then incrementally folding in the purchases in $N$. +\end{example} + +\section{The Reduce Combinator}\label{sec:reduce} + +The \texttt{reduce} combinator produces a \emph{view} of a collection by summarizing the values for each key. + +\begin{definition}[Reduce Combinator] +Let $\oplus : A \times V \to A$ be a pairwise commutative operation and $\iota \in A$ an initial value. +Given a collection $C : K \to \mathcal{M}(V)$, the \emph{reduce} combinator produces a view: +\[ +\mathsf{reduce}_{\iota,\oplus}(C) : K \to A +\] +defined as: +\[ +\mathsf{reduce}_{\iota,\oplus}(C)(k) = \mathsf{fold}_\oplus(\iota, C(k)) +\] +That is, for each key $k$, we fold the operation $\oplus$ over all values in $C(k)$, starting from $\iota$. +\end{definition} + +\begin{example}[Per-user analytics as reduce] +For the purchase collection $C$ above, two common choices of $\oplus$ are: +\begin{itemize} + \item the \emph{sum} operation $\oplus_{\mathsf{sum}}(a,v) = a + v$ with $\iota_{\mathsf{sum}} = 0$, giving each user their total spend; + \item the \emph{count} operation $\oplus_{\mathsf{count}}(a,v) = a + 1$ with $\iota_{\mathsf{count}} = 0$, giving each user their number of purchases. +\end{itemize} +Then, for example, +\[ +\mathsf{reduce}_{\iota_{\mathsf{sum}},\oplus_{\mathsf{sum}}}(C)(\mathit{user1}) = 110, +\qquad +\mathsf{reduce}_{\iota_{\mathsf{count}},\oplus_{\mathsf{count}}}(C)(\mathit{user1}) = 3. +\] +\end{example} + +The view $\mathsf{reduce}_{\iota,\oplus}(C)$ is a derived collection that depends on $C$. +When $C$ changes, the view must be updated to remain consistent. +The next section addresses how to perform these updates efficiently. + +\section{Incremental Updates}\label{sec:incremental} + +When a collection $C$ changes, the view $\mathsf{reduce}_{\iota,\oplus}(C)$ must be updated. +A na\"ive approach would recompute the fold from scratch for each affected key, requiring $O(n)$ time where $n$ is the size of the multiset. +To achieve $O(1)$ updates, we introduce a \emph{remove} operation $\ominus$ that can undo the effect of $\oplus$. + +\subsection{Reducers} + +\begin{definition}[Reducer] +A \emph{reducer} is a triple $R = (\iota, \oplus, \ominus)$ where $\iota \in A$ is an initial value, and +\[ +\oplus, \ominus : A \times V \to A +\] +are update operations such that both $\oplus$ and $\ominus$ are pairwise commutative in the sense of Definition~\ref{def:pairwise-commutative}. +We call $\oplus$ the \emph{add} operation and $\ominus$ the \emph{remove} operation. +\end{definition} + +For a reducer $R = (\iota, \oplus, \ominus)$, we write $\mathsf{reduce}_R$ for $\mathsf{reduce}_{\iota,\oplus}$. + +\begin{definition}[Well-Formed Reducer]\label{def:well-formed-reducer} +A reducer $R = (\iota, \oplus, \ominus)$ is \emph{well-formed} if $\ominus$ is the \textbf{inverse} of $\oplus$ on reachable accumulator values, that is, for all finite multisets $M \in \mathcal{M}(V)$ and all $v \in V$: +\[ +( \mathsf{fold}_\oplus(\iota, M) \oplus v) \ominus v = \mathsf{fold}_\oplus(\iota, M) +\] +\end{definition} + +\begin{example}[Interpreting well-formedness for sum] +For $R_{\mathsf{sum}}$ and any multiset $M$ of purchase amounts and value $v \in \mathbb{Z}$, well-formedness says that if we start from the total spend $\mathsf{fold}_\oplus(0, M)$, then adding a purchase of amount $v$ and immediately removing it leaves the total unchanged: +\[ +\big(\mathsf{fold}_\oplus(0, M) + v\big) - v = \mathsf{fold}_\oplus(0, M). +\] +This is exactly the intuitive requirement that $\ominus$ undo the effect of $\oplus$ on reachable accumulator states. +\end{example} + +In database terminology, a well-formed reducer defines an \emph{invertible distributive aggregate} (see Section~\ref{subsec:aggregate-classes}): the fold can be computed over partitions independently (distributive), and individual values can be removed from the accumulated result (invertible). + +\begin{remark}[Remove-Add Commutativity] +For well-formed reducers where $\oplus$ and $\ominus$ arise from an abelian group action on $A$, the following property holds automatically: +\[ +\forall a \in A, v_1, v_2 \in V.\; (a \ominus v_1) \oplus v_2 = (a \oplus v_2) \ominus v_1 +\] +This ensures that the order of interleaved adds and removes does not affect the final result. +All practical reducers (sum, count, product over commutative groups) satisfy this. +\end{remark} + +\subsection{Aggregate Classes}\label{subsec:aggregate-classes} + +The database literature~\cite{viewmaintenance} classifies aggregates as \emph{distributive}, \emph{algebraic}, or \emph{holistic}. +In our setting, a pair $(\iota, \oplus)$ defines a \emph{distributive aggregate} when folding over a union of multisets can be decomposed into folds over the parts. + +\begin{definition}[Distributive Aggregate] +Let $\oplus : A \times V \to A$ be pairwise commutative and $\iota \in A$. +We say that $(\iota, \oplus)$ is a \emph{distributive aggregate} if for all finite multisets $M, N \in \mathcal{M}(V)$: +\[ +\mathsf{fold}_\oplus(\iota, M \uplus N) = \mathsf{fold}_\oplus\big(\mathsf{fold}_\oplus(\iota, M), N\big). +\] +\end{definition} + +By Lemma~1, any pair $(\iota, \oplus)$ with pairwise commutative $\oplus$ is a distributive aggregate in this sense. +Moreover, a well-formed reducer $R = (\iota, \oplus, \ominus)$ (Definition~\ref{def:well-formed-reducer}) is precisely an \emph{invertible distributive aggregate}: the aggregate is distributive over partitions of the multiset, and individual contributions can be removed using $\ominus$. + +The database literature also defines \emph{algebraic} aggregates~\cite{viewmaintenance}: aggregates that can be computed by maintaining a fixed number of distributive aggregates and post-processing their results. +For example, average is algebraic because it can be computed from the distributive aggregates sum and count via division. +In Skip, this corresponds to using a well-formed reducer with richer accumulator state (e.g., $(sum, count)$ pairs) followed by a pointwise mapper to extract the final value. +We illustrate this pattern in Section~\ref{sec:examples}. + +Finally, \emph{holistic} aggregates~\cite{viewmaintenance} cannot be computed from bounded intermediate state---they potentially require access to the entire multiset. +Examples include: +\begin{itemize} + \item \textbf{MEDIAN}: needs the full distribution to pick the middle value(s) + \item \textbf{QUANTILES/PERCENTILES}: similar to median, require global ordering information + \item \textbf{RANK}: depends on the position of a value within the full sorted dataset +\end{itemize} +For holistic aggregates, any exact incremental solution must maintain auxiliary state that grows with the data (e.g., the entire multiset or an order-statistic tree) in order to answer updates and queries. +Skip can of course support such analyses by using richer data structures or approximations (e.g., quantile sketches), but these fall outside the constant-space, purely algebraic reducer model we formalize in this paper. + +\subsection{Deltas} + +We model updates to collections as deltas. + +\begin{definition}[Delta] +A \emph{delta} $\Delta$ for a collection $C$ is a pair $(\Delta^+, \Delta^-)$ where: +\begin{itemize} + \item $\Delta^+ : K \to \mathcal{M}(V)$ represents added values + \item $\Delta^- : K \to \mathcal{M}(V)$ represents removed values +\end{itemize} +We require that $\Delta^-(k) \subseteq C(k)$ for all $k$ (i.e., we can only remove values that exist in the collection). +\end{definition} + +\begin{definition}[Delta Application] +Given a collection $C$ and a delta $\Delta = (\Delta^+, \Delta^-)$ for $C$, the updated collection $C \bullet \Delta$ is defined pointwise by: +\[ +C \bullet \Delta = \lambda k.\, (C(k) \setminus \Delta^-(k)) \uplus \Delta^+(k) +\] +Intuitively, $C \bullet \Delta$ is the collection obtained by first removing all values in $\Delta^-$ from $C$ and then adding all values in $\Delta^+$. +\end{definition} + +\begin{example}[Changing a user's purchases] +For key $k = \mathit{user1}$ in the purchase collection $C$, suppose we remove one \$30 purchase and add a new \$20 purchase. +This yields a delta with +\[ +\Delta^-(k) = \{30\},\qquad \Delta^+(k) = \{20\} +\] +and updated multiset +\[ +C'(k) = (C(k) \setminus \{30\}) \uplus \{20\} = \{30,50,20\}, +\] +which again records all purchases for \emph{user1} but with one order-irrelevant change in their amounts. +\end{example} + +\begin{remark}[Operational Construction of Deltas] +In the Skip runtime, for each key $k$ we compute an \emph{old} multiset $\mathit{old}(k)$ of contributing values and a \emph{new} multiset $\mathit{new}(k)$. The delta is then constructed as: +\[ +\Delta^+(k) = \mathit{new}(k) \setminus \mathit{old}(k) +\quad\text{and}\quad +\Delta^-(k) = \mathit{old}(k) \setminus \mathit{new}(k). +\] +Note that $\Delta^+$ and $\Delta^-$ are disjoint by construction. +If $C_{\mathit{old}}$ is the collection with $C_{\mathit{old}}(k) = \mathit{old}(k)$, then $\Delta^-(k) \subseteq C_{\mathit{old}}(k)$, so $\Delta$ is a valid delta for $C_{\mathit{old}}$. +Moreover, $C_{\mathit{old}} \bullet \Delta = C_{\mathit{new}}$ where $C_{\mathit{new}}(k) = \mathit{new}(k)$. +\end{remark} + +\subsection{Incremental Update} + +\begin{definition}[Incremental Reduce] +Given the current accumulator value $a_k$ for key $k$ and a delta $\Delta$, the new accumulator is computed as: +\[ +\mathsf{update}_R(a_k, \Delta, k) = + \mathsf{fold}_\oplus\big( + \mathsf{fold}_\ominus(a_k, \Delta^-(k)),\; + \Delta^+(k) + \big) +\] +That is, we first apply all removals $\Delta^-(k)$ to $a_k$ using $\ominus$, and then apply all additions $\Delta^+(k)$ using $\oplus$. +This is well-defined since $\oplus$ and $\ominus$ are pairwise commutative. +Here $\mathsf{fold}_\ominus$ is the multiset fold induced by $\ominus$, defined exactly as in Section~\ref{sec:preliminaries}; pairwise commutativity of $\ominus$ guarantees it is well-defined. +\end{definition} + +\begin{example}[Updating a user's total spend] +Let $R_{\mathsf{sum}}$ be the sum reducer and consider again $k = \mathit{user1}$ with +\[ +C(k) = \{30,30,50\}, \qquad a_k = \mathsf{reduce}_{R_{\mathsf{sum}}}(C)(k) = 110. +\] +Applying the delta above (removing \$30, adding \$20) gives +\[ +\mathsf{update}_{R_{\mathsf{sum}}}(a_k, \Delta, k) + = \mathsf{fold}_\oplus(\mathsf{fold}_\ominus(110, \{30\}), \{20\}) + = (110 - 30) + 20 + = 100, +\] +which matches the total obtained by recomputing from scratch on the updated multiset $C'(k) = \{30,50,20\}$. +\end{example} + +\section{Correctness}\label{sec:correctness} + +We now characterize exactly when incremental updates are correct. +At some moment we have an \emph{old} collection $C$ and, for each key $k$, an accumulator +\[ +a_k = \mathsf{reduce}_R(C)(k) +\] +that agrees with the denotational semantics. +A change to the collection is described abstractly by a delta $\Delta$, yielding the \emph{updated} collection +\[ +C' = C \bullet \Delta. +\] +For each key $k$ there are then two ways to obtain the \emph{new} accumulator value: +\begin{itemize} + \item \emph{Denotational recompute:} ignore $a_k$ and compute + \[ + a'_k = \mathsf{reduce}_R(C')(k), + \] + i.e.\ start from $\iota$ and fold $\oplus$ over the current multiset $C'(k)$. + \item \emph{Incremental update:} update the old accumulator $a_k$ using the delta by + \[ + a'_k = \mathsf{update}_R(a_k, \Delta, k), + \] + i.e.\ first remove $\Delta^-(k)$ using $\ominus$, then add $\Delta^+(k)$ using $\oplus$. +\end{itemize} + +\begin{definition}[Incremental Correctness Property] +A reducer $R$ satisfies the \emph{incremental correctness property} if for all collections $C$, all valid deltas $\Delta$ for $C$, and all keys $k$: +\[ +\mathsf{reduce}_R(C \bullet \Delta)(k) = \mathsf{update}_R(\mathsf{reduce}_R(C)(k), \Delta, k) +\] +\end{definition} + +\begin{example}[Sum as a sanity check] +For the sum reducer $R_{\mathsf{sum}}$, the incremental correctness property simply formalizes the familiar claim that ``subtracting the amounts of removed purchases and adding the amounts of new purchases yields the same total spend as recomputing the sum from scratch'' for every user and every delta. +\end{example} + +\begin{lemma}[Cancellation for a Delta]\label{lem:delta-cancels} +Let $R = (\iota, \oplus, \ominus)$ be a well-formed reducer and $C$ a collection. +For any valid delta $\Delta = (\Delta^+, \Delta^-)$ for $C$ and key $k$, writing $M = C(k)$ and $M_0 = M \setminus \Delta^-(k)$, we have: +\[ +\mathsf{fold}_\ominus\big(\mathsf{fold}_\oplus(\iota, M), \Delta^-(k)\big) = \mathsf{fold}_\oplus(\iota, M_0). +\] +\end{lemma} + +\begin{proof} +Write $D = \Delta^-(k)$. +We proceed by induction on the size of the multiset $D$. +If $D = \emptyset$, then $\mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M), D) = \mathsf{fold}_\oplus(\iota,M)$ by definition, and $M_0 = M$, so the claim holds. + +For the inductive step, suppose $D = D' \uplus \{v\}$ for some value $v$ and sub-multiset $D' \subseteq M$, and assume the statement holds for $D'$. +Since $M = M_0 \uplus D = M_0 \uplus D' \uplus \{v\}$ and $\oplus$ is pairwise commutative, Lemma~1 (fold over union of multisets) gives: +\[ +\mathsf{fold}_\oplus(\iota,M) + = \mathsf{fold}_\oplus\big(\mathsf{fold}_\oplus(\iota,M_0 \uplus D'), \{v\}\big) + = \mathsf{fold}_\oplus(\iota,M') \oplus v +\] +for the multiset $M' = M_0 \uplus D'$. +Applying well-formedness with multiset $M'$ and value $v$ yields +\[ +(\mathsf{fold}_\oplus(\iota,M') \oplus v) \ominus v = \mathsf{fold}_\oplus(\iota,M'). +\] +Since $\mathsf{fold}_\oplus(\iota,M) = \mathsf{fold}_\oplus(\iota,M') \oplus v$, we have: +\[ +\mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M), \{v\}) + = (\mathsf{fold}_\oplus(\iota,M') \oplus v) \ominus v + = \mathsf{fold}_\oplus(\iota,M'). +\] +Now $M' \setminus D' = (M_0 \uplus D') \setminus D' = M_0$, so by the induction hypothesis applied to $M'$ and $D'$ we obtain +\[ +\mathsf{fold}_\ominus\big(\mathsf{fold}_\oplus(\iota,M'), D'\big) + = \mathsf{fold}_\oplus(\iota,M' \setminus D') + = \mathsf{fold}_\oplus(\iota,M_0). +\] +Since $\ominus$ is pairwise commutative, Lemma~1 applies to $\ominus$ as well, giving: +\[ +\mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M), D) + = \mathsf{fold}_\ominus(\mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M), \{v\}), D'), +\] +and using the equalities above, +\[ +\mathsf{fold}_\ominus(\mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M), \{v\}), D') + = \mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M'), D'), +\] +and finally +\[ +\mathsf{fold}_\ominus(\mathsf{fold}_\oplus(\iota,M'), D') = \mathsf{fold}_\oplus(\iota,M_0), +\] +which completes the proof. +\end{proof} + +\begin{example}[Delta cancellation for total spend] +Continuing the purchase example, let $R_{\mathsf{sum}}$ be the sum reducer and consider key $k = \mathit{user1}$ with +\[ +C(k) = \{30,30,50\}. +\] +Suppose we remove one \$30 purchase and keep the other two purchases unchanged, so +\[ +\Delta^-(k) = \{30\}, \qquad \Delta^+(k) = \emptyset, +\] +and hence +\[ +M = C(k), \quad D = \Delta^-(k), \quad M_0 = M \setminus D = \{30,50\}. +\] +Lemma~\ref{lem:delta-cancels} states that +\[ +\mathsf{fold}_\ominus\big(\mathsf{fold}_\oplus(0, M), D\big) + = \mathsf{fold}_\oplus(0, M_0). +\] +Indeed, $\mathsf{fold}_\oplus(0, M) = 0 + 30 + 30 + 50 = 110$, so +\[ +\mathsf{fold}_\ominus(110, \{30\}) = 110 - 30 = 80, +\] +while +\[ +\mathsf{fold}_\oplus(0, M_0) = 0 + 30 + 50 = 80. +\] +Thus incrementally removing the multiset of cancelled purchases yields exactly the total spend over the remaining purchases. +\end{example} + +The following theorem shows that the inverse property is both necessary and sufficient for incremental correctness. + +\begin{theorem}[Characterization of Incremental Correctness]\label{thm:equivalence} +Let $R = (\iota, \oplus, \ominus)$ be a reducer (with pairwise commutative $\oplus$ and $\ominus$). +The following are equivalent: +\begin{enumerate} + \item $R$ is well-formed in the sense of Definition~\ref{def:well-formed-reducer}. + \item $R$ satisfies the incremental correctness property. +\end{enumerate} +\end{theorem} + +\begin{proof} +We prove both directions. + +\medskip +\noindent\textbf{(1 $\Rightarrow$ 2): Well-formedness implies correctness.} + +Assume $R$ is well-formed. Let $C$ be a collection, $\Delta = (\Delta^+, \Delta^-)$ a valid delta for $C$, and $k$ a key. +Write $M = C(k)$ for the old multiset, $M' = C'(k) = (M \setminus \Delta^-(k)) \uplus \Delta^+(k)$ for the new multiset, and $a = \mathsf{fold}_\oplus(\iota, M)$ for the old accumulator. + +We must show $\mathsf{fold}_\oplus(\iota, M') = \mathsf{fold}_\oplus(\mathsf{fold}_\ominus(a, \Delta^-(k)), \Delta^+(k))$. + +Since $\Delta^-(k) \subseteq M$, we can write $M = M_0 \uplus \Delta^-(k)$ for some multiset $M_0$. +By pairwise commutativity of $\oplus$: +\[ +a = \mathsf{fold}_\oplus(\iota, M) = \mathsf{fold}_\oplus(\iota, M_0 \uplus \Delta^-(k)) = \mathsf{fold}_\oplus(\mathsf{fold}_\oplus(\iota, M_0), \Delta^-(k)) +\] + +Let $a_0 = \mathsf{fold}_\oplus(\iota, M_0)$. +Then $a = \mathsf{fold}_\oplus(a_0, \Delta^-(k))$. + +By Lemma~\ref{lem:delta-cancels} we have +\[ +\mathsf{fold}_\ominus(a, \Delta^-(k)) = \mathsf{fold}_\oplus(\iota, M_0) = a_0. +\] + +Therefore: +\[ +\mathsf{fold}_\oplus(\mathsf{fold}_\ominus(a, \Delta^-(k)), \Delta^+(k)) = \mathsf{fold}_\oplus(a_0, \Delta^+(k)) +\] + +Since $M' = M_0 \uplus \Delta^+(k)$: +\[ +\mathsf{fold}_\oplus(\iota, M') = \mathsf{fold}_\oplus(\mathsf{fold}_\oplus(\iota, M_0), \Delta^+(k)) = \mathsf{fold}_\oplus(a_0, \Delta^+(k)) +\] + +Thus both sides are equal. + +\medskip +\noindent\textbf{(2 $\Rightarrow$ 1): Correctness implies well-formedness.} + +Assume $R$ satisfies the incremental correctness property. +We must show $(a \oplus v) \ominus v = a$ for all $a \in A$ and $v \in V$. + +Fix $a \in A$ and $v \in V$. +We first establish the property for $a$ of the form $a = \mathsf{fold}_\oplus(\iota, M)$ for some multiset $M$. + +Define a collection $C$ with a single key $k$ where $C(k) = M \uplus \{v\}$. +Then: +\[ +\mathsf{reduce}_R(C)(k) = \mathsf{fold}_\oplus(\iota, M \uplus \{v\}) = \mathsf{fold}_\oplus(\mathsf{fold}_\oplus(\iota, M), \{v\}) = a \oplus v +\] + +Define delta $\Delta$ with $\Delta^-(k) = \{v\}$ and $\Delta^+(k) = \emptyset$. +This is valid since $v \in C(k)$. +The updated collection is $C'$ with $C'(k) = M$. + +By the incremental correctness property: +\[ +\mathsf{reduce}_R(C')(k) = \mathsf{update}_R(\mathsf{reduce}_R(C)(k), \Delta, k) +\] + +The left side is: +\[ +\mathsf{reduce}_R(C')(k) = \mathsf{fold}_\oplus(\iota, M) = a +\] + +The right side is: +\[ +\mathsf{update}_R(a \oplus v, \Delta, k) = \mathsf{fold}_\oplus(\mathsf{fold}_\ominus(a \oplus v, \{v\}), \emptyset) = (a \oplus v) \ominus v +\] + +Therefore $(a \oplus v) \ominus v = a$. + +This establishes the inverse property for all $a$ of the form $\mathsf{fold}_\oplus(\iota, M)$, i.e.\ for all reachable accumulator values, which is exactly the condition in Definition~\ref{def:well-formed-reducer}. +\end{proof} + +\begin{remark} +In many practical reducers (for example sum over integers or product over rationals), every accumulator value is reachable as $\mathsf{fold}_\oplus(\iota, M)$ for some multiset $M$, so the definition of well-formedness above coincides with the simpler global inverse law $(a \oplus v) \ominus v = a$ for all $a \in A$, $v \in V$. +\end{remark} + +\begin{remark}[Partial Reducers in Skip] +Theorem~\ref{thm:equivalence} characterizes when incremental updates are correct for well-formed reducers. +In Skip's concrete API, the remove operation is allowed to be \emph{partial}: a reducer's remove function can signal ``recompute from scratch'' (by returning \texttt{None} in the ReScript bindings) instead of producing an updated accumulator. +Such partial reducers handle cases where $\ominus$ cannot efficiently invert $\oplus$---for example, computing the minimum without maintaining auxiliary state. +The runtime responds by recomputing the fold from scratch for that key. +Examples of partial reducers appear in Section~\ref{sec:examples}. +\end{remark} + +\section{Examples}\label{sec:examples} + +\subsection{Sum Reducer} + +\[ +R_{\mathsf{sum}} = (0, \lambda(a,v).\, a + v, \lambda(a,v).\, a - v) +\] + +This reducer is well-formed: addition is commutative, and subtraction is the inverse of addition. + +\paragraph{Worked example.} +Suppose for key $k$ we have $C(k) = \{3, 5, 7\}$, so the current view is $a_k = 0 + 3 + 5 + 7 = 15$. +Now suppose value $5$ is removed and value $2$ is added, giving delta $\Delta^-(k) = \{5\}$ and $\Delta^+(k) = \{2\}$. +The incremental update computes: +\[ +a'_k = (15 - 5) + 2 = 12 +\] +This matches a full recompute: $0 + 3 + 7 + 2 = 12$. + +\subsection{Count Reducer} + +\[ +R_{\mathsf{count}} = (0, \lambda(a,v).\, a + 1, \lambda(a,v).\, a - 1) +\] + +This reducer counts the number of values, ignoring their content. +It is well-formed since $(a+1)-1 = a$. + +\paragraph{Worked example.} +For $C(k) = \{x, y, z\}$, we have $a_k = 3$. +If $y$ is removed ($\Delta^- = \{y\}$) and $w$ is added ($\Delta^+ = \{w\}$), then: +\[ +a'_k = (3 - 1) + 1 = 3 +\] + +\subsection{Average Reducer (Algebraic)} + +The average is a classic example of an \emph{algebraic} aggregate~\cite{viewmaintenance}: it can be expressed as a post-processing of a distributive aggregate over richer state. +A standard encoding uses accumulator state $A = \mathbb{R} \times \mathbb{N}$ to track sum and count. + +Define: +\[ +R_{\mathsf{avgState}} = \big((0,0),\ \lambda((s,c),v).\, (s+v, c+1),\ \lambda((s,c),v).\, (s-v, c-1)\big) +\] +with accumulator state $A = \mathbb{R} \times \mathbb{N}$ tracking (sum, count). +On reachable states with $c > 0$, the corresponding average is $\mathsf{avg} = s / c$; when $c = 0$ (empty multiset), the view can be defined as a designated ``no value'' (e.g., \texttt{None}) or $0$ depending on the application. +This reducer is well-formed: addition and subtraction on sum and increment/decrement on count satisfy the inverse law on all reachable accumulator states, so Theorem~\ref{thm:equivalence} applies. +In Skip, the average view can be implemented by first using $\mathsf{reduce}_{R_{\mathsf{avgState}}}$ and then applying a pointwise mapper that divides sum by count for each key. + +Note that maintaining \emph{only} the average (without count) is insufficient: to update the average when adding a value, one needs to know how many values contributed to the current average. +Thus, average is genuinely an algebraic aggregate requiring auxiliary state, unlike sum or count which can be maintained with a single accumulator value. + +\subsection{Min Reducer (Partial)} + +The min reducer demonstrates why invertibility is essential for incremental updates. +With accumulator $A = \mathbb{R} \cup \{+\infty\}$, the add operation is: +\[ +\iota = +\infty, \quad \oplus = \lambda(a,v).\, \min(a,v) +\] + +However, there is no inverse operation $\ominus$ that works in general. +Consider $C(k) = \{3, 5\}$ with $a_k = 3$. +\begin{itemize} + \item If we remove $5$: we need $a'_k = 3$. We could define $(3 \ominus 5) = 3$ (removing a non-minimum has no effect). + \item If we remove $3$: we need $a'_k = 5$. But from $a_k = 3$ alone, we cannot know that $5$ was the second-smallest value! +\end{itemize} + +This shows min is \emph{not} an invertible distributive aggregate: knowing only the accumulated minimum is insufficient to update the result when the minimum itself is removed. + +In Skip's implementation, min is handled as a \emph{partial reducer}: +\begin{itemize} + \item The remove function signals ``cannot update incrementally'' (e.g., returns \texttt{None}) + \item The runtime responds by recomputing from scratch: $\mathsf{fold}_{\min}(\iota, C'(k))$ + \item Alternatively, one can maintain richer state (e.g., a sorted multiset of all values), making the remove operation invertible on that richer state---but this is no longer a constant-space reducer +\end{itemize} + +\section{Related Work}\label{sec:related} + +The problem of efficiently maintaining aggregations over changing data has been studied extensively across streaming systems, reactive programming frameworks, and incremental computation. +We organize our discussion around a key design dimension: \emph{what operational concerns must developers understand and manage?} +Skip's design philosophy---infrastructure abstraction paired with exposed algebraic structure---positions it within a broader movement toward declarative programming models where developers express domain logic without infrastructure concerns~\cite{zedlewski2025eventhidden}. + +\paragraph{Infrastructure-exposed streaming systems.} +Traditional streaming systems require developers to understand and work with operational concerns: how data flows through pipelines, when windows trigger, how state is partitioned, and how retractions are processed. +These systems expose infrastructure complexity, forcing developers to think about event propagation, state management, and update mechanisms. + +\emph{Apache Flink} (Table API) supports user-defined aggregate functions via classes with methods such as \texttt{createAccumulator}, \texttt{accumulate}, and (for streaming queries that receive updates) \texttt{retract}. +These objects are reusable within Flink pipelines, but they are tied to Flink's table runtime and changelog/retraction model; the interface does not present a standalone reducer triple $R = (\iota, \oplus, \ominus)$ with an explicit invertibility law. +\emph{Apache Kafka Streams} provides aggregation over KTables by passing two callbacks to \texttt{aggregate}: one to add new values and one to remove old values when keys are updated or deleted. +These callbacks are defined as part of a particular topology and bound to KTable's update mechanism, rather than as reusable reducer values with a user-facing correctness contract. +\emph{Esper CEP} supports \texttt{enter}/\texttt{leave} callbacks and delivers ``new'' and ``old'' results for sliding windows, but the engine itself implements aggregation and retractions; developers react to removals in listeners rather than defining a separate inverse operation. +Systems like Microsoft's \emph{Trill}~\cite{trill} similarly let users define custom aggregations with explicit accumulate and de-accumulate functions, but these remain tied to the streaming query's execution and lack any user-facing formal correctness conditions. +In all of these systems, add/remove logic is supplied as configuration for specific operators within a streaming engine, not as a general-purpose, portable combinator abstraction. +Moreover, these systems expose their event-driven, retraction-based architectures; developers must understand how retractions propagate, when state is synchronized, and how correctness is ensured. + +\emph{Apache Beam} and \emph{Spark Streaming} take a different approach: they hide user-defined inverse operations entirely, handling retractions internally via recomputation or diffing. +While this simplifies the developer interface, it removes the ability to express domain-specific inverse operations, and developers must still reason about operational concerns like windowing, triggering, and state management. + +\paragraph{Infrastructure-hidden, structure-hidden systems.} +At the other extreme, some systems hide both infrastructure complexity and the underlying algebraic structure, working at higher levels of abstraction. + +\emph{Differential dataflow}~\cite{differentialdataflow} and \emph{DBSP}~\cite{dbsp} provide rigorous foundations for incremental computation using Z-sets and abelian groups. +These frameworks internally ensure that all built-in aggregations have inverses, but users typically work at the level of SQL queries or dataflow graphs without explicitly defining $\oplus$ and $\ominus$. +\emph{Materialize}~\cite{materialize} is a streaming SQL database built on differential dataflow that exposes incremental view maintenance via SQL, while hiding the underlying use of differential updates. +The algebraic structure exists but is hidden; users cannot define custom reducers that exploit invertibility for domain-specific aggregations. + +\emph{Functional reactive programming} libraries (Fran, Yampa, Reactive Banana~\cite{frp}) and UI-oriented frameworks (React, Vue, Svelte, Solid) encapsulate event routing and dependency tracking, exposing declarative combinators like map, filter, and scan/foldp. +State evolution is typically described as folds over event streams (e.g., \texttt{foldp}, \texttt{scan}, \texttt{accum}), but these folds are opaque to the runtime: there is no distinguished remove operation or notion of an invertible reducer. +As a result, the frameworks can optimize propagation and caching, but not user-specified algebraic inverses for efficient handling of removals. + +\emph{Incremental computation} frameworks (Adapton, Jane Street's Incremental) hide infrastructure complexity via dependency tracking, but also hide the inverse operation; removal is handled internally through dependency graphs rather than explicit algebraic inverses. + +\paragraph{Skip's approach: infrastructure-hidden, structure-exposed.} +Skip's design occupies a distinctive position among these systems: it hides infrastructure complexity while exposing algebraic structure directly to application developers. +Developers write aggregation logic declaratively as pure functions over collections, without thinking about event routing, cache invalidation, or state synchronization; the reactive runtime handles these operational concerns automatically. +At the same time, Skip exposes the algebraic structure of reducers as first-class combinators: developers define $\oplus$ and $\ominus$ explicitly, enabling domain-specific aggregations with formal correctness guarantees. +Several prior systems require user-supplied inverse functions or define formal minus/change operators in their semantic models, but Skip combines a user-facing reducer triple, reuse across multiple reactive contexts, and a formal correctness theorem that connects this triple to the runtime's operational behavior. + +This design aligns with a broader trend toward ``event-hidden architectures'' seen in modern distributed systems~\cite{zedlewski2025eventhidden}. +Just as SQL databases hide query execution plans while exposing relational algebra, and just as React hides DOM manipulation while exposing declarative components, Skip hides event queues and reactive runtime internals while exposing mathematical combinators. +The runtime remains event-driven under the covers, but developers work at the level of what to compute, not how events propagate. + +Skip's reducer $R = (\iota, \oplus, \ominus)$ is a first-class combinator---a reusable, composable value applicable to any reactive collection via $\mathsf{reduce}_R$, not a callback tied to specific engine contexts. +This enables developers to define custom aggregations (e.g., domain-specific metrics, business logic) that compose naturally with other Skip combinators, while the runtime automatically handles incremental updates, dependency tracking, and cache management. + +\paragraph{Database view maintenance and aggregate classification.} +The database literature classifies aggregates as \emph{distributive}, \emph{algebraic}, or \emph{holistic}~\cite{viewmaintenance}. +Skip's well-formed reducers correspond to \emph{invertible distributive aggregates}: the fold can be computed over partitions independently (distributive), and individual contributions can be removed using the inverse operation (invertible). +In the database literature, such aggregates are often described as \emph{self-maintainable} or invertible distributive aggregates~\cite{viewmaintenance}. +Tangwongsan et al.~\cite{slidingwindow} note that ``prior work often relies on aggregation functions to be invertible'' for efficient sliding-window maintenance. +Yin et al.~\cite{incgraph} require inverse functions for incremental graph aggregation. +These works focus on algorithmic techniques; our contribution is to formalize correctness for Skip's user-facing abstraction, providing developers with a precise specification for writing correct custom reducers. + +\paragraph{Our formal contribution.} +Skip's documentation specifies an informal correctness condition for user-defined reducers: the result of applying the runtime's sequence of remove/add calls must equal recomputing from scratch. +Our contribution is to: +\begin{enumerate} + \item Formalize this correctness condition as a well-formedness property + \item Prove that correctness holds \emph{if and only if} the property is satisfied (Theorem~\ref{thm:equivalence}) + \item Connect Skip's design to the theory of invertible distributive aggregates +\end{enumerate} +This gives Skip users a precise specification for writing correct custom reducers, backed by a complete formal characterization. +The well-formedness condition is a clear, local specification: developers define $\oplus$ and $\ominus$ explicitly and reason about whether $(a \oplus v) \ominus v = a$ holds for all reachable accumulator states $a$, rather than about the runtime's operational behavior. + +\begin{thebibliography}{12} + +\bibitem{skip} +Skip Team. +\textit{Skip: A Reactive Programming Framework}. +\url{https://github.com/SkipLabs/skip}, 2024. + +\bibitem{differentialdataflow} +Frank McSherry, Derek G.\ Murray, Rebecca Isaacs, and Michael Isard. +Differential dataflow. +In \textit{Proceedings of CIDR}, 2013. + +\bibitem{dbsp} +Mihai Budiu, Tej Chajed, Frank McSherry, Leonid Ryzhyk, and Val Tannen. +DBSP: Automatic incremental view maintenance. +In \textit{Proceedings of VLDB}, 16(7):1601--1614, 2023. + +\bibitem{rx} +Erik Meijer. +Your mouse is a database. +\textit{Communications of the ACM}, 55(5):66--73, 2012. + +\bibitem{selfadjusting} +Umut A.\ Acar, Guy E.\ Blelloch, and Robert Harper. +Adaptive functional programming. +In \textit{Proceedings of POPL}, pages 247--259, 2002. + +\bibitem{viewmaintenance} +Ashish Gupta and Inderpal Singh Mumick. +Maintenance of materialized views: Problems, techniques, and applications. +\textit{IEEE Data Engineering Bulletin}, 18(2):3--18, 1995. + +\bibitem{slidingwindow} +Kanat Tangwongsan, Martin Hirzel, Scott Schneider, and Kun-Lung Wu. +General incremental sliding-window aggregation. +\textit{Proceedings of the VLDB Endowment}, 8(7):702--713, 2015. + +\bibitem{frp} +Conal Elliott and Paul Hudak. +Functional reactive animation. +In \textit{Proceedings of ICFP}, pages 263--273, 1997. + +\bibitem{incgraph} +Shufeng Yin, Huanchen Zhang, Zhengyi Yang, Wentao Han, Wenguang Chen, and Yingxia Shao. +GraphBolt: Dependency-driven synchronous processing of streaming graphs. +In \textit{Proceedings of ICDE}, 2022. + +\bibitem{trill} +Badrish Chandramouli, Jonathan Goldstein, Ryan Hill, Mirek Interlandi, Vladimir V.\ Lychagin, Morgan May, and Ravishankar Ramamurthy. +Trill: A high-performance incremental query processor for diverse analytics. +\textit{Proceedings of the VLDB Endowment}, 8(4):401--412, 2014. + +\bibitem{materialize} +Materialize, Inc. +Materialize: Streaming SQL database. +\url{https://materialize.com}, 2025. + +\bibitem{zedlewski2025eventhidden} +Charles Zedlewski. +Event-hidden architectures. +SkipLabs Blog, \url{https://skiplabs.io/blog/event-hidden-arch}, April 2025. + +\end{thebibliography} + +\end{document}