| // Copyright 2018 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ |
| #define UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ |
| |
| #include <string> |
| #include <unordered_map> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/macros.h" |
| #include "third_party/cld_3/src/src/nnet_language_identifier.h" |
| #include "ui/accessibility/ax_enums.mojom.h" |
| #include "ui/accessibility/ax_export.h" |
| |
| namespace ui { |
| |
| class AXNode; |
| class AXTree; |
| |
| // This module implements language detection enabling Chrome to automatically |
| // detect the language for runs of text within the page. |
| // |
| // Node-level language detection runs once per page after the load complete |
| // event. This involves two passes: |
| // *Detect* walks the tree from the given root using cld3 to detect up to 3 |
| // potential languages per node. A ranked list is created enumerating |
| // all potential languages on a page. |
| // *Label* re-walks the tree, assigning a language to each node considering |
| // the potential languages from the detect phase, page level |
| // statistics, and the assigned languages of ancestor nodes. |
| // |
| // Optionally an embedder may run *sub-node* language detection which attempts |
| // to assign languages for runs of text within a node, potentially down to the |
| // individual character level. This is useful in cases where a single paragraph |
| // involves switching between multiple languages, and where the speech engine |
| // doesn't automatically switch voices to handle different character sets. |
| // Due to the potentially small lengths of text runs involved this tends to be |
| // lower in accuracy, and works best when a node is composed of multiple |
| // languages with easily distinguishable scripts. |
| |
| // AXLanguageInfo represents the local language detection data for all text |
| // within an AXNode. Stored on AXNode. |
| struct AX_EXPORT AXLanguageInfo { |
| AXLanguageInfo(); |
| ~AXLanguageInfo(); |
| |
| // This is the final language we have assigned for this node during the |
| // 'label' step, it is the result of merging: |
| // a) The detected language for this node |
| // b) The declared lang attribute on this node |
| // c) the (recursive) language of the parent (detected or declared). |
| // |
| // This will be the empty string if no language was assigned during label |
| // phase. |
| // |
| // IETF BCP 47 Language code (rfc5646). |
| // examples: |
| // 'de' |
| // 'de-DE' |
| // 'en' |
| // 'en-US' |
| // 'es-ES' |
| // |
| // This should not be read directly by clients of AXNode, instead clients |
| // should call AXNode::GetLanguage(). |
| std::string language; |
| |
| // Detected languages for this node sorted as returned by |
| // FindTopNMostFreqLangs, which sorts in decreasing order of probability, |
| // filtered to remove any unreliable results. |
| std::vector<std::string> detected_languages; |
| }; |
| |
// An AXLanguageSpan labels one substring of a UTF-8 string with a language
// and a probability. The substring is identified by start and end indices.
//
// The string that the indices are relative to is not stored in this
// structure; see the documentation on GetLanguageAnnotationForStringAttribute
// for details on how to associate a span with its string.
//
// Every scalar member has a default initializer so that a default-constructed
// span never exposes indeterminate values. Member order is unchanged, so
// positional aggregate initialization keeps working (C++14 and later).
struct AX_EXPORT AXLanguageSpan {
  // Start and end indices of the substring, relative to the UTF-8 string.
  int start_index = 0;
  int end_index = 0;

  // Language associated with the substring.
  std::string language;

  // Probability associated with |language|.
  float probability = 0.0f;
};
| |
| // A single AXLanguageInfoStats instance is stored on each AXTree and contains |
| // statistics on detected languages for all the AXNodes in that tree. |
| // |
| // We rely on these tree-level statistics when labelling individual nodes, to |
| // provide extra signals to increase our confidence in assigning a detected |
| // language. |
| // |
| // The Label step will only assign a detected language to a node if that |
| // language is one of the most frequent languages on the page. |
| // |
| // For example, if a single node has detected_languages (in order of probability |
| // assigned by cld_3): da-DK, en-AU, fr-FR, but the page statistics overall |
| // indicate that the page is generally in en-AU and ja-JP, it is more likely to |
| // be a mis-recognition of Danish than an accurate assignment, so we assign |
| // en-AU instead of da-DK. |
| class AX_EXPORT AXLanguageInfoStats { |
| public: |
| AXLanguageInfoStats(); |
| ~AXLanguageInfoStats(); |
| |
| // Adjust our statistics to add provided detected languages. |
| void Add(const std::vector<std::string>& languages); |
| |
| // Fetch the score for a given language. |
| int GetScore(const std::string& lang) const; |
| |
| // Check if a given language is within the top results. |
| bool CheckLanguageWithinTop(const std::string& lang); |
| |
| private: |
| // Store a count of the occurrences of a given language. |
| std::unordered_map<std::string, unsigned int> lang_counts_; |
| |
| // Cache of last calculated top language results. |
| // A vector of pairs of (score, language) sorted by descending score. |
| std::vector<std::pair<unsigned int, std::string>> top_results_; |
| // Boolean recording that we have not mutated the statistics since last |
| // calculating top results, setting this to false will cause recalculation |
| // when the results are next fetched. |
| bool top_results_valid_; |
| |
| void InvalidateTopResults(); |
| |
| void GenerateTopResults(); |
| |
| DISALLOW_COPY_AND_ASSIGN(AXLanguageInfoStats); |
| }; |
| |
| // AXLanguageDetectionManager manages all of the context needed for language |
| // detection within an AXTree. |
| class AX_EXPORT AXLanguageDetectionManager { |
| public: |
| AXLanguageDetectionManager(); |
| ~AXLanguageDetectionManager(); |
| |
| // Detect language for each node in the subtree rooted at the given node. |
| // This is the first pass in detection and labelling. |
| // This only detects the language, it does not label it, for that see |
| // LabelLanguageForSubtree. |
| void DetectLanguageForSubtree(AXNode* subtree_root); |
| |
| // Label language for each node in the subtree rooted at the given node. |
| // This is the second pass in detection and labelling. |
| // This will label the language, but relies on the earlier detection phase |
| // having already completed. |
| void LabelLanguageForSubtree(AXNode* subtree_root); |
| |
| // Sub-node language detection for a given string attribute. |
| // For example, if a node has name: "My name is Fred", then calling |
| // GetLanguageAnnotationForStringAttribute(*node, ax::mojom::StringAttribute:: |
| // kName) would return language detection information about "My name is Fred". |
| std::vector<AXLanguageSpan> GetLanguageAnnotationForStringAttribute( |
| const AXNode& node, |
| ax::mojom::StringAttribute attr); |
| |
| private: |
| // TODO(chrishall): should this be stored by pointer or value? |
| AXLanguageInfoStats lang_info_stats; |
| |
| void DetectLanguageForSubtreeInternal(AXNode* subtree_root); |
| void LabelLanguageForSubtreeInternal(AXNode* subtree_root); |
| |
| // This language identifier is constructed with a default minimum byte length |
| // of chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider and is |
| // used for detecting page-level languages. |
| chrome_lang_id::NNetLanguageIdentifier language_identifier_; |
| |
| // This language identifier is constructed with a minimum byte length of |
| // kShortTextIdentifierMinByteLength so it can be used for detecting languages |
| // of shorter text (e.g. one character). |
| chrome_lang_id::NNetLanguageIdentifier short_text_language_identifier_; |
| |
| DISALLOW_COPY_AND_ASSIGN(AXLanguageDetectionManager); |
| }; |
| |
| } // namespace ui |
| |
| #endif // UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ |