opensextant.extractors.poli

These "Patterns of Life" are strictly examples to illustrate more general reg-ex patterns (that is, more general than the coordinate and date/time patterns).

The main objective here is to show how, beyond basic regex matches, we can add important business logic to the entity extraction process.

 1"""
 2These "Patterns of Life" are strictly examples to illustrate more general reg-ex patterns
 3 (that is more general than the coordinate and date/time patterns).
 4
 5 The main objective here is to show how beyond basic regex matches, we can add important business logic to the
 6 entity extraction process.
 7
 8"""
 9
10from opensextant.FlexPat import PatternMatch, RegexPatternManager
11
12
class TelephoneNumber(PatternMatch):
    """Demonstration PatternMatch for telephone numbers."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to PatternMatch, then set `case` to UPPER_CASE."""
        super().__init__(*args, **kwargs)
        self.case = PatternMatch.UPPER_CASE

    def normalize(self):
        """Run the base-class normalization; phone-specific work is still TBD."""
        super().normalize()
        # Placeholder output until telephone normalization is implemented.
        print("TBD - normalize phone")
22
class MACAddress(PatternMatch):
    """Demonstration PatternMatch for MAC addresses."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to PatternMatch, then set `case` to UPPER_CASE."""
        super().__init__(*args, **kwargs)
        self.case = PatternMatch.UPPER_CASE

    def normalize(self):
        """Run the base-class normalization; MAC-specific work is still TBD."""
        super().normalize()
        # Placeholder output until MAC address normalization is implemented.
        print("TBD - normalize MAC address")
32
class Money(PatternMatch):
    """Demonstration PatternMatch for currency amounts."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to PatternMatch, then set `case` to LOWER_CASE."""
        super().__init__(*args, **kwargs)
        self.case = PatternMatch.LOWER_CASE

    def normalize(self):
        """Run the base-class normalization; money-specific work is still TBD."""
        super().normalize()
        # Placeholder output until money normalization is implemented.
        print("TBD - normalize Money")
42
class PatternsOfLifeManager(RegexPatternManager):
    """
    Demonstration.  PatternsOfLifeManager is a custom RegexPatternManager
    that shows how to apply FlexPat to extracting common things like
    currency amounts, MAC addresses and telephone numbers.
    """

    def __init__(self, cfg):
        """
        Call as
            mgr = PatternsOfLifeManager("poli_patterns.cfg")
            patternsApp = PatternExtractor( mgr )

            test_results = patternsApp.default_tests()
            real_results = patternsApp.extract( ".... text blob..." )

        :param cfg: patterns config file.
        """
        # This demonstration manager always runs with debug and testing enabled.
        super().__init__(cfg, debug=True, testing=True)

    def _initialize(self):
        """
        Install the match-class registry, then run the base-class initialization.

        The registry maps the Java class names used in the config file to the
        Python variations in this module.
        """
        java_prefix = "org.opensextant.extractors.poli.data."
        python_prefix = "opensextant.extractors.poli."
        # NOTE(review): no EmailAddress class is visible in this module's listing;
        # confirm it exists or that the registry tolerates an unresolved name.
        class_names = ("TelephoneNumber", "MACAddress", "Money", "EmailAddress")

        # The registry must be in place before the base class initializes.
        self.match_class_registry = {
            java_prefix + name: python_prefix + name for name in class_names
        }
        super()._initialize()
class TelephoneNumber(opensextant.FlexPat.PatternMatch):
14class TelephoneNumber(PatternMatch):
15    def __init__(self, *args, **kwargs):
16        PatternMatch.__init__(self, *args, **kwargs)
17        self.case = PatternMatch.UPPER_CASE
18
19    def normalize(self):
20        PatternMatch.normalize(self)
21        print("TBD - normalize phone")

A general Pattern-based TextMatch. This Python variation consolidates PoliMatch (patterns-of-life = poli) ideas in the Java API.

def normalize(self):
19    def normalize(self):
20        PatternMatch.normalize(self)
21        print("TBD - normalize phone")

Optional but recommended routine to normalize the matched data: parse fields, uppercase, streamline punctuation, etc. Given the normalized result, this is also the opportunity to validate the match.

class MACAddress(opensextant.FlexPat.PatternMatch):
24class MACAddress(PatternMatch):
25    def __init__(self, *args, **kwargs):
26        PatternMatch.__init__(self, *args, **kwargs)
27        self.case = PatternMatch.UPPER_CASE
28
29    def normalize(self):
30        PatternMatch.normalize(self)
31        print("TBD - normalize MAC address")

A general Pattern-based TextMatch. This Python variation consolidates PoliMatch (patterns-of-life = poli) ideas in the Java API.

def normalize(self):
29    def normalize(self):
30        PatternMatch.normalize(self)
31        print("TBD - normalize MAC address")

Optional but recommended routine to normalize the matched data: parse fields, uppercase, streamline punctuation, etc. Given the normalized result, this is also the opportunity to validate the match.

class Money(opensextant.FlexPat.PatternMatch):
34class Money(PatternMatch):
35    def __init__(self, *args, **kwargs):
36        PatternMatch.__init__(self, *args, **kwargs)
37        self.case = PatternMatch.LOWER_CASE
38
39    def normalize(self):
40        PatternMatch.normalize(self)
41        print("TBD - normalize Money")

A general Pattern-based TextMatch. This Python variation consolidates PoliMatch (patterns-of-life = poli) ideas in the Java API.

def normalize(self):
39    def normalize(self):
40        PatternMatch.normalize(self)
41        print("TBD - normalize Money")

Optional but recommended routine to normalize the matched data: parse fields, uppercase, streamline punctuation, etc. Given the normalized result, this is also the opportunity to validate the match.

class PatternsOfLifeManager(opensextant.FlexPat.RegexPatternManager):
44class PatternsOfLifeManager(RegexPatternManager):
45    #
46    # Demonstration.  PatternsOfLifeManager is a custom RegexPatternManager
47    # that shows how to apply FlexPat to extracting common things like currency amounts, MAC addresses
48    # and telephone numbers.
49    #
50    def __init__(self, cfg):
51        """
52        Call as
53            mgr = PatternsOfLifeManager("poli_patterns.cfg")
54            patternsApp = PatternExtractor( mgr )
55
56            test_results = patternsApp.default_tests()
57            real_results = patternsApp.extract( ".... text blob..." )
58
59        :param cfg: patterns config file.
60        """
61        RegexPatternManager.__init__(self, cfg, debug=True, testing=True)
62
63    def _initialize(self):
64        #
65        # This Class registry maps the existing Java classes (in the config file) to Python variations here.
66        self.match_class_registry = {
67            "org.opensextant.extractors.poli.data.TelephoneNumber":
68                "opensextant.extractors.poli.TelephoneNumber",
69
70            "org.opensextant.extractors.poli.data.MACAddress":
71                "opensextant.extractors.poli.MACAddress",
72
73            "org.opensextant.extractors.poli.data.Money":
74                "opensextant.extractors.poli.Money",
75
76            "org.opensextant.extractors.poli.data.EmailAddress":
77                "opensextant.extractors.poli.EmailAddress"
78        }
79        RegexPatternManager._initialize(self)

RegexPatternManager is the patterns configuration file parser. See documentation: https://opensextant.github.io/Xponents/doc/Patterns.md

PatternsOfLifeManager(cfg)
50    def __init__(self, cfg):
51        """
52        Call as
53            mgr = PatternsOfLifeManager("poli_patterns.cfg")
54            patternsApp = PatternExtractor( mgr )
55
56            test_results = patternsApp.default_tests()
57            real_results = patternsApp.extract( ".... text blob..." )
58
59        :param cfg: patterns config file.
60        """
61        RegexPatternManager.__init__(self, cfg, debug=True, testing=True)

Call as:

    mgr = PatternsOfLifeManager("poli_patterns.cfg")
    patternsApp = PatternExtractor( mgr )

    test_results = patternsApp.default_tests()
    real_results = patternsApp.extract( ".... text blob..." )

:param cfg: patterns config file.