{
  "patent_number": "US 11170293",
  "country": "US",
  "title": "How AI Systems Learn to Predict and Act Simultaneously",
  "original_title": "Multi-model controller",
  "summary": "A method for training AI models that combines supervised learning for prediction with reinforcement learning for decision-making in a single, coordinated system.",
  "what_it_does": "This patent describes a system that uses two types of neural networks working in tandem to improve how an AI makes decisions. First, a Recurrent Neural Network (RNN) processes data to create a 'state' (a summary of what is happening) and predicts a result. Second, a Q-Network (QN) looks at that state to suggest the best action to take. The system then uses a 'reference result' (the actual outcome) to update the RNN using supervised learning, while simultaneously updating the Q-Network using reinforcement learning based on the action taken and the new state. This allows the system to get better at both understanding its environment and choosing the right actions over time.",
  "what_it_does_not_cover": [
    "Does not cover systems that rely solely on supervised learning without a reinforcement learning feedback loop.",
    "Does not cover static models that do not update their parameters (the 'second' model versions) based on new observation values.",
    "Does not cover non-recurrent neural networks that lack the ability to maintain state information over time.",
    "Does not cover systems that do not use a communications interface to receive external reference result values."
  ],
  "filed": "2015-12-30",
  "granted": "2021-11-09",
  "expires": null,
  "status": "active",
  "holder": "Microsoft Technology Licensing LLC",
  "holder_url": "https://patentbrief.org/company/microsoft-technology-licensing-llc",
  "inventors": [
    {
      "name": "Ji He",
      "url": "https://patentbrief.org/inventor/ji-he"
    },
    {
      "name": "Prabhdeep Singh",
      "url": "https://patentbrief.org/inventor/prabhdeep-singh"
    },
    {
      "name": "Lihong Li",
      "url": "https://patentbrief.org/inventor/lihong-li"
    },
    {
      "name": "Xiaodong He",
      "url": "https://patentbrief.org/inventor/xiaodong-he"
    },
    {
      "name": "Jianfeng Gao",
      "url": "https://patentbrief.org/inventor/jianfeng-gao"
    },
    {
      "name": "Jianshu Chen",
      "url": "https://patentbrief.org/inventor/jianshu-chen"
    },
    {
      "name": "Li Deng",
      "url": "https://patentbrief.org/inventor/li-deng"
    },
    {
      "name": "Xiujun Li",
      "url": "https://patentbrief.org/inventor/xiujun-li"
    }
  ],
  "times_cited": 4,
  "tags": [
    "ai_ml",
    "software",
    "mechanical",
    "telecommunications"
  ],
  "abstract": "A processing unit can operate a first recurrent computational model (RCM) to provide first state information and a predicted result value. The processing unit can operating a first network computational model (NCM) to provide respective expectation values of a plurality of actions based at least in part on the first state information. The processing unit can provide an indication of at least one of the plurality of actions, and receive a reference result value, e.g., via a communications interface. The processing unit can train the first RCM based at least in part on the predicted result value and the reference result value to provide a second RCM, and can train the first NCM based at least in part on the first state information and the at least one of the plurality of actions to provide a second NCM.",
  "url": "https://patentbrief.org/patent/us/11170293/alphago-policy-and-value-networks",
  "markdown_url": "https://patentbrief.org/patent/us/11170293/alphago-policy-and-value-networks/md",
  "google_patents_url": "https://patents.google.com/patent/US11170293",
  "relatedPatents": []
}