dougal · novalis · Feb 12, 2012 · Feb 12, 2012 · Feb 12, 2012 · Feb 12, 2012
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ coverage
 index
 *.gem
 .bundle
+*~
diff --git a/README.rdoc b/README.rdoc
@@ -170,6 +170,11 @@ acts_as_indexed plugin folder. Then point your web browser at
 Alternatively, you can view the rdoc documentation
 online[http://rdoc.info/projects/dougal/acts_as_indexed/].
 
+== Stemming
+
+This branch has basic support for stemming using the Porter stemmer.
+Quoted portions of queries will be searched exactly (without
+stemming).
 
 == Problems, Comments, Suggestions?
 
@@ -189,7 +194,7 @@ All of the above are most welcome. mailto:dougal.s@gmail.com
 * Ben Anderson
 * Theron Toomey
 * Uģis Ozols
-
+* David Turner
 
 == Future Releases
 

diff --git a/lib/acts_as_indexed/configuration.rb b/lib/acts_as_indexed/configuration.rb
@@ -20,6 +20,10 @@ class Configuration
     # Default is 3.
     attr_reader :min_word_size
 
+    # The regular expression which defines how words are separated;
+    # words are also separated on word-nonword boundaries
+    attr_reader :space_regexp
+
     # Proc that allows you to turn on or off index for a record.
     # Useful if you don't want an object to be placed in the index, such as a
     # draft post.
@@ -42,6 +46,7 @@ def initialize
       @if_proc          = if_proc
       @case_sensitive   = false
       @disable_auto_indexing = false
+      @space_regexp = /[\s\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]+/
     end
 
     # Since we cannot expect Rails to be available on load, it is best to put

diff --git a/lib/acts_as_indexed/search_atom.rb b/lib/acts_as_indexed/search_atom.rb
@@ -6,8 +6,11 @@
 module ActsAsIndexed #:nodoc:
   class SearchAtom
 
-    # Contains a hash of records.
-    # { 'record_id' => [pos1, pos2, pos] }
+    # Contains a hash keyed by words having a common stem;
+    # each value is another hash from record ids to word positions:
+    # { 'foo'    => { record_id => [pos1, pos2, ...] },
+    #   'fooing' => { record_id => [pos3, pos4, ...] }
+    # }
     #--
     # Weighting:
     # http://www.perlmonks.com/index.pl?node_id=27509
@@ -22,68 +25,114 @@ def initialize(records={})
 
     # Returns true if the given record is present.
     def include_record?(record_id)
-      @records.include?(record_id)
+      @records.values.any? {|record| record.member?(record_id)}
     end
 
-    # Adds +record_id+ to the stored records.
-    def add_record(record_id)
-      @records[record_id] = [] unless include_record?(record_id)
+    def include_token?(token)
+      return @records.member? token
     end
 
-    # Adds +pos+ to the array of positions for +record_id+.
-    def add_position(record_id, pos)
-      add_record(record_id)
-      @records[record_id] << pos
+    # Adds +token+ to the stored tokens.
+    def add_token(token)
+      @records[token] = {} unless @records[token]
+    end
+
+    def add_record_token(token, record_id)
+      add_token(token)
+      @records[token][record_id] ||= []
+    end
+
+    # Adds +pos+ to the array of positions for +token+ and +record_id+.
+    def add_position(record_id, token, pos)
+      add_record_token(token, record_id)
+      @records[token][record_id] << pos
+    end
+
+    # Returns a hash of record_id => [positions], where the positions
+    # are those of that record combined across all tokens.
+    def flat_records
+      flat = {}
+      @records.each do |token, records|
+        records.each do |record, positions|
+          flat[record] ||= []
+          flat[record] += positions
+        end
+      end
+      flat
     end
 
     # Returns all record IDs stored in this Atom.
     def record_ids
-      @records.keys
+      @records.values.map{|h| h.keys}.inject '+'
     end
 
     # Returns an array of positions for +record_id+ stored in this Atom.
-    def positions(record_id)
-      @records[record_id]
+    def all_positions(record_id)
+      @records.values.map {|h| h[record_id]}.inject '+'
+    end
+
+    # Returns a hash of record->array of positions for +token+ stored
+    # in this Atom.
+    def records_by_token(token)
+      @records[token]
     end
 
     # Removes +record_id+ from this Atom.
     def remove_record(record_id)
-      @records.delete(record_id)
+      @records.values.each{|v| v.delete(record_id)}
     end
 
     # Creates a new SearchAtom with the combined records from self and other
     def +(other)
-      SearchAtom.new(@records.clone.merge!(other.records) { |key, _old, _new|
-                                                            _old + _new
-                                                          })
+      SearchAtom.new(@records.clone.merge!(other.records) {
+                       |key, _old, _new|
+                       _old.merge(_new) {
+                         |k, o, n|
+                         o + n
+                       }
+                     })
     end
 
+    def exact(token)
+      SearchAtom.new(Hash[*@records.find_all {|k, v| k == token }.flatten])
+    end
+
+
     # Creates a new SearchAtom with records in other removed from self.
     def -(other)
-      records = @records.clone.reject { |name, records| other.records.include?(name) }
+      records = {}
+      @records.each { |token, records_for_token|
+        if other.records.include? (token)
+          other_token_records = other.records[token]
+          new_records = records_for_token.reject {|id, records| other_token_records.include?(id) }
+          if new_records.size
+            records[token] = new_records
+          end
+        end
+      }
       SearchAtom.new(records)
     end
 
-    # Returns at atom containing the records and positions of +self+ preceded by +former+
-    # "former latter" or "big dog" where "big" is the former and "dog" is the latter.
+    # Returns an atom containing the records and positions of +self+
+    # preceded by +former+, e.g. "former latter" or "big dog", where
+    # "big" is the former and "dog" is the latter.
+
     def preceded_by(former)
       matches = SearchAtom.new
-      latter = {}
-      former.record_ids.each do |rid|
-        latter[rid] = @records[rid] if @records[rid]
-      end
-      # Iterate over each record in latter.
-      latter.each do |record_id,pos|
-
-        # Iterate over each position.
-        pos.each do |p|
-          # Check if previous position is in former.
-          if former.include_position?(record_id,p-1)
-            matches.add_record(record_id) unless matches.include_record?(record_id)
-            matches.add_position(record_id,p)
+
+      for former_token, former_records in former.records
+        for latter_token, latter_records in @records
+          for latter_record, latter_positions in latter_records
+            next unless former_records.member? latter_record
+
+            # This record appears in both the former and latter atoms.
+            for former_pos in former_records[latter_record]
+              if latter_positions.member? former_pos + 1
+                matches.add_position(latter_record, latter_token, former_pos + 1)
+              end
+            end
           end
         end
-
       end
       matches
     end
@@ -92,7 +141,8 @@ def preceded_by(former)
     # atom.
     def weightings(records_size)
       out = {}
-      @records.each do |r_id, pos|
+      flat = flat_records
+      flat.each do |r_id, pos|
 
         # Fixes a bug when the records_size is zero. i.e. The only record
         # contaning the word has been deleted.
@@ -104,15 +154,19 @@ def weightings(records_size)
         # weighting = frequency * log (records.size / records_with_atom)
         ## parndt 2010/05/03 changed to records_size.to_f to avoid -Infinity Errno::ERANGE exceptions
         ## which would happen for example Math.log(1 / 20) == -Infinity but Math.log(1.0 / 20) == -2.99573227355399
-        out[r_id] = pos.size * Math.log(records_size.to_f / @records.size)
+        out[r_id] = pos.size * Math.log(records_size.to_f / flat_records.size)
       end
       out
     end
 
     protected
 
     def include_position?(record_id,pos)
-      @records[record_id].include?(pos)
+      @records.any? {|record|
+        if record.include? record_id
+          record[record_id].include?(pos)
+        end
+      }
     end
 
   end
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ coverage @@
     index
     *.gem
     .bundle
+    *~