Brian Beckman shows off a simple hash join in LINQ, using VB9. The performace boosts reinforce some work my partner and I did, experimenting with load-time generated indexes over collections.
There was a significant hit incurred when creating the index, but it was assumed that you wouldn't bother unless it actually did any good.
The code below is somewhat naive, but shows one approach. In this case, the index tracks several different features of a number, even/odd, positive/negative, and the values modulo 10 and modulo 3. The query is then over the index partitions, rather than the raw collection, and returns a new index containing just the partitions that matched. Enumerating over an index yields the indexed items.
On my system, I get times similar to these. The qa* are queries over a List<double>, while qb* are queries over an Indexable<double, NumberIndex>. The total range of doubles went from -1e7 to +1e7.
Time to create index = 21943ms qa1: Count = 1000000; Time = 2131ms qa2: Count = 9999999; Time = 1470ms qa3: Count = 10000000; Time = 2396ms qa4: Count = 333333; Time = 1965ms qb1: Count = 1000000; Time = 87ms qb2: Count = 9999999; Time = 616ms qb3: Count = 10000000; Time = 812ms qb4: Count = 333333; Time = 22ms
Code follows...
using System; using System.Collections.Generic; using System.Text; using System.Query; using System.Xml.XLinq; using System.Data.DLinq; namespace IndexLinq { public class Indexable<TItem, TKey> : IEnumerable<TItem> { internal Dictionary<TKey, List<TItem>> _index = new Dictionary<TKey, List<TItem>>(); internal Func<TItem, TKey> _indexer; private int? _capacity = null; public Indexable(Func<TItem, TKey> indexer) { this._indexer = indexer; } public void AddRange(IEnumerable<TItem> source) { if (this._indexer != null) { Refresh(source); } } public void AddRange(IEnumerable<TItem> source, int capacity) { this._capacity = capacity; AddRange(source); } public void AddRange(IList<TItem> source) { AddRange(source, source.Count / 10); } public void Clear() { this._index.Clear(); } private void Refresh(IEnumerable<TItem> items) { if (items == null || _index == null) { throw new InvalidOperationException("Unable to refresh when indexable does not have both a source and an indexer defined."); } foreach (TItem item in items) { TKey key = _indexer(item); if (!_index.ContainsKey(key)) { if (this._capacity.HasValue) { _index.Add(key, new List<TItem>(this._capacity.Value)); } else { _index.Add(key, new List<TItem>()); } Console.WriteLine("Added new key: {0}", key); } _index[key].Add(item); } } #region IEnumerable<TItem> Members public IEnumerator<TItem> GetEnumerator() { foreach (List<TItem> list in _index.Values) { foreach (TItem item in list) { yield return item; } } } #endregion #region IEnumerable Members System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() { return this.GetEnumerator(); } #endregion } }
using System; using System.Collections.Generic; using System.Text; namespace IndexLinq { public static class IndexLinqExtensions { public static Indexable<TItem, TKey> Where<TItem, TKey>(this Indexable<TItem, TKey> indexable, Predicate<TKey> predicate) { Indexable<TItem, TKey> result = new Indexable<TItem, TKey>(indexable._indexer); foreach (TKey key in indexable._index.Keys) { if (predicate(key)) { result._index.Add(key, indexable._index[key]); } } return result; } } }
using System; using System.Collections.Generic; using System.Text; using System.Query; using System.Xml.XLinq; using System.Data.DLinq; using IndexLinq; using System.Diagnostics; namespace IndexLinqTest { class Program { static void Main(string[] args) { var numbers = new List<double>(Numbers(-1e7, 1e7)); Stopwatch stopwatch = new Stopwatch(); stopwatch.Reset(); stopwatch.Start(); var indexed = new Indexable(new Func(Indexer)); indexed.AddRange(numbers); stopwatch.Stop(); Console.WriteLine("Time to create index = {0}ms", stopwatch.ElapsedMilliseconds); // non-indexed var qa1 = from i in numbers where i % 10 == 1 select i; var qa2 = from i in numbers where i > 0 select i; var qa3 = from i in numbers where i % 2 == 0 select i; var qa4 = from i in numbers where i % 10 == 1 && i % 3 == 0 select i; // indexed var qb1 = from i in indexed where i.Mod10 == 1 select i; var qb2 = from i in indexed where i.IsPositive == true select i; var qb3 = from i in indexed where i.IsEven == true select i; var qb4 = from i in indexed where i.Mod10 == 1 && i.Mod3 == 0 select i; var queries = new[] { new object[]{ qa1, "qa1" }, new object[]{ qa2, "qa2" }, new object[]{ qa3, "qa3" }, new object[]{ qa4, "qa4" }, new object[]{ qb1, "qb1" }, new object[]{ qb2, "qb2" }, new object[]{ qb3, "qb3" }, new object[]{ qb4, "qb4" } }; int count; foreach (object[] query in queries) { count = 0; stopwatch.Reset(); stopwatch.Start(); foreach (var i in query[0] as IEnumerable<double>) { count++; } stopwatch.Stop(); Console.WriteLine("{0}: Count = {1}; Time = {2}ms", query[1], count, stopwatch.ElapsedMilliseconds ); } Console.ReadKey(true); } public struct NumberIndex { public bool IsEven; public bool IsPositive; public double Mod10; public double Mod3; public override bool Equals(object obj) { if (!(obj is NumberIndex)) return false; NumberIndex numberIndex = (NumberIndex) obj; return this.IsEven == numberIndex.IsEven && this.IsPositive == numberIndex.IsPositive && this.Mod10 == numberIndex.Mod10 && this.Mod3 == numberIndex.Mod3; } public override int GetHashCode() { return (int) (this.Mod10 * 1000 + (this.Mod3 * 100) + (this.IsEven ? 10 : 0) + (this.IsPositive ? 1 : 0)); } public override string ToString() { return String.Format(@"[Mod10 = {0}; Mod3 = {3}; IsEven = {1}; IsPositive = {2}]", this.Mod10, this.IsEven, this.IsPositive, this.Mod3); } } static NumberIndex Indexer(double i) { NumberIndex result = new NumberIndex(); result.IsEven = (i % 2 == 0); result.IsPositive = (i > 0); result.Mod10 = (i % 10); result.Mod3 = (i % 3); return result; } static IEnumerable<double> Numbers(double start, double end) { if (start > end) { double x = end; end = start; start = x; } for (double i = start; i < end; i++) { yield return i; } } } }