{
  "patent_number": "US 10878335",
  "country": "US",
  "title": "How Computers Find Similar Text Using Compact Data Structures",
  "original_title": "Scalable text analysis using probabilistic data structures",
  "summary": "This patent describes a method for efficiently identifying similar text records, such as documents or product reviews, by recording each record's terms in a compact probabilistic data structure and then analyzing that structure with machine learning.",
  "what_it_does": "The system described in Claim 1 takes a piece of text, such as a product review, and uses a \"hashing-based function\" to map its words (e.g., \"excellent\") to specific spots in a \"probabilistic data structure.\" This data structure acts as a compact, approximate summary of the terms that appear in the text. When a word is mapped, the system updates an entry in this structure to indicate the word's presence. Importantly, a single entry can represent multiple words (Claim 1), which keeps the structure small and efficient. After updating, the system applies a \"dimensionality reduction algorithm\" to the data structure to simplify it, then feeds the result into a \"similarity detection algorithm\" to estimate how similar the new text is to other text records. For example, it could find customer reviews that discuss similar product features.",
  "what_it_does_not_cover": [
    "Does not cover systems that store every single word explicitly in a traditional database for similarity comparison, because the claimed method relies on probabilistic storage in which a single entry can represent more than one text term.",
    "Does not cover similarity detection that does not use a probabilistic data structure as the starting input for the subsequent dimensionality reduction and similarity analysis.",
    "Does not cover text analysis methods that do not involve applying a hashing-based function to text terms to update the data structure.",
    "Does not cover systems that omit the step of applying a dimensionality reduction algorithm on the probabilistic data structure before generating similarity indications.",
    "Does not cover combining data structures without using bit-level Boolean operations or vector instructions, as specified in Claim 3."
  ],
  "filed": "2016-06-14",
  "granted": "2020-12-29",
  "expires": null,
  "status": "active",
  "holder": "Amazon Technologies Inc",
  "holder_url": "https://patentbrief.org/company/amazon-technologies-inc",
  "inventors": [
    {
      "name": "Robert Mark Waugh",
      "url": "https://patentbrief.org/inventor/robert-mark-waugh"
    }
  ],
  "times_cited": 18,
  "tags": [
    "software",
    "ai_ml",
    "ecommerce",
    "telecommunications",
    "consumer_electronics"
  ],
  "abstract": "A mapping function is used to identify one or more entries of a probabilistic data structure to be updated to indicate a presence of a particular term in a text record. Some entries of the data structure may correspond to more than one term. The data structure is used as input for a machine learning algorithm which provides an indication of similarity between the text record and other text records.",
  "url": "https://patentbrief.org/patent/us/10878335/bert-bidirectional-encoder-representations",
  "markdown_url": "https://patentbrief.org/patent/us/10878335/bert-bidirectional-encoder-representations/md",
  "google_patents_url": "https://patents.google.com/patent/US10878335",
  "relatedPatents": []
}