SystemT: an Algebraic Approach to Declarative Information … · 2017. 8. 12. · Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Proin, in sagittis facilisis, volutpat
This document is posted to help you gain knowledge. Please leave a comment to let me know what you think about it! Share it to your friends and learn new things together.
---------------------------------------
-- ValidLastNameAll: last-name candidates from LastNameAll, minus hyphenated
-- names that mix a fully capitalized part with a part containing lowercase
-- (\p{Ll}) or caseless (\p{Lo}) letters.
--   1st predicate: uppercase run, hyphen, then text containing a lowercase/caseless letter
--   2nd predicate: text containing a lowercase/caseless letter, then hyphen + uppercase run
create view ValidLastNameAll as
select N.lastname as lastname
from LastNameAll N
-- do not allow partially all capitalized words
where Not(MatchesRegex(/(\p{Lu}\p{M}*)+-.*([\p{Ll}\p{Lo}]\p{M}*).*/, N.lastname))
  and Not(MatchesRegex(/.*([\p{Ll}\p{Lo}]\p{M}*).*-(\p{Lu}\p{M}*)+/, N.lastname));
-- LastName: the rows of ValidLastNameAll consolidated on the lastname span.
-- No explicit consolidation policy is given, so the engine default applies.
create view LastName as
select C.lastname as lastname
--from Consolidate(ValidLastNameAll.lastname) C;
from ValidLastNameAll C
consolidate on C.lastname;
-- Find dictionary matches for all first names
-- Mostly US first names
-- Keeps only matches of 'strictFirst.dict' that start with an uppercase letter
-- (plus optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName1 as
select D.match as firstname
from Dictionary('strictFirst.dict', Doc.text) D
--where MatchesRegex(/\p{Upper}\p{Lower}[\p{Alpha}]{0,20}/, D.match);
-- changed to enable unicode match
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- German first names
-- Keeps only matches of 'strictFirst_german.dict' that start with an uppercase
-- letter (plus optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName2 as
select D.match as firstname
from Dictionary('strictFirst_german.dict', Doc.text) D
--where MatchesRegex(/\p{Upper}\p{Lower}[\p{Alpha}]{0,20}/, D.match);
--where MatchesRegex(/\p{Upper}.{1,20}/, D.match);
-- changed to enable unicode match
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- nick names for US first names
-- Keeps only matches of 'strictNickName.dict' that start with an uppercase
-- letter (plus optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName3 as
select D.match as firstname
from Dictionary('strictNickName.dict', Doc.text) D
--where MatchesRegex(/\p{Upper}\p{Lower}[\p{Alpha}]{0,20}/, D.match);
--where MatchesRegex(/\p{Upper}.{1,20}/, D.match);
-- changed to enable unicode match
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- german first name from blue page
-- Keeps only matches of 'strictFirst_german_bluePages.dict' that start with an
-- uppercase letter (plus optional combining marks) followed by 1 to 20 further
-- characters.
create view StrictFirstName4 as
select D.match as firstname
from Dictionary('strictFirst_german_bluePages.dict', Doc.text) D
--where MatchesRegex(/\p{Upper}\p{Lower}[\p{Alpha}]{0,20}/, D.match);
--where MatchesRegex(/\p{Upper}.{1,20}/, D.match);
-- changed to enable unicode match
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- Italy first name from blue pages
-- Keeps only dictionary matches that start with an uppercase letter (plus
-- optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName5 as
select D.match as firstname
from Dictionary('names/strictFirst_italy.dict', Doc.text) D
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- France first name from blue pages
-- Keeps only dictionary matches that start with an uppercase letter (plus
-- optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName6 as
select D.match as firstname
from Dictionary('names/strictFirst_france.dict', Doc.text) D
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- Spain first name from blue pages
-- Keeps only dictionary matches that start with an uppercase letter (plus
-- optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName7 as
select D.match as firstname
from Dictionary('names/strictFirst_spain.dict', Doc.text) D
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- Indian first name from blue pages
-- TODO: still need to clean up the remaining entries
-- Keeps only dictionary matches that start with an uppercase letter (plus
-- optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName8 as
select D.match as firstname
from Dictionary('names/strictFirst_india.partial.dict', Doc.text) D
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- Israel first name from blue pages
-- Keeps only dictionary matches that start with an uppercase letter (plus
-- optional combining marks) followed by 1 to 20 further characters.
create view StrictFirstName9 as
select D.match as firstname
from Dictionary('names/strictFirst_israel.dict', Doc.text) D
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
-- union all the dictionary matches for first names
-- StrictFirstName: bag union (duplicates kept) of StrictFirstName1 .. StrictFirstName9.
create view StrictFirstName as
    (select S.firstname as firstname from StrictFirstName1 S)
union all
    (select S.firstname as firstname from StrictFirstName2 S)
union all
    (select S.firstname as firstname from StrictFirstName3 S)
union all
    (select S.firstname as firstname from StrictFirstName4 S)
union all
    (select S.firstname as firstname from StrictFirstName5 S)
union all
    (select S.firstname as firstname from StrictFirstName6 S)
union all
    (select S.firstname as firstname from StrictFirstName7 S)
union all
    (select S.firstname as firstname from StrictFirstName8 S)
union all
    (select S.firstname as firstname from StrictFirstName9 S);
-- Relaxed versions of first namecreate view RelaxedFirstName1 asselect CombineSpans(S.firstname, CP.name) as firstnamefrom StrictFirstName S,
-- FirstName: the rows of ValidFirstNameAll consolidated on the firstname span.
-- No explicit consolidation policy is given, so the engine default applies.
create view FirstName as
select C.firstname as firstname
--from Consolidate(ValidFirstNameAll.firstname) C;
from ValidFirstNameAll C
consolidate on C.firstname;
-- Combine all dictionary matches for both last names and first names
-- Keeps only matches of 'name.dict' that start with an uppercase letter (plus
-- optional combining marks) followed by 1 to 20 further characters.
create view NameDict as
select D.match as name
from Dictionary('name.dict', Doc.text) D
--where MatchesRegex(/\p{Upper}\p{Lower}[\p{Alpha}]{0,20}/, D.match);
--where MatchesRegex(/\p{Upper}.{1,20}/, D.match);
-- changed to enable unicode match
where MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
where FollowsTok(FN.firstname, IW.word, 0, 0)and FollowsTok(IW.word, CP.name, 0, 0);
/*** Translation for Rule 3r2* * This relaxed version of rule '3' will find person names like Thomas B.M. David* But it only insists that the second word is in the person dictionary*//*<rule annotation=Person id=3r2><internal><token attribute={etc}>CAPSPERSON</token><token attribute={etc}>INITIALWORD</token><token attribute={etc}PERSON:ST:LNAME{etc}>CAPSPERSON</token></internal></rule>*/
-- Person3r2: CapsPerson, then InitialWord, then LastName, each following the
-- previous one with zero tokens in between. The result span runs from the
-- CapsPerson match through the LastName match.
create view Person3r2 as
select CombineSpans(CP.name, LN.lastname) as person
from LastName LN,
     InitialWord IW,
     CapsPerson CP
where FollowsTok(CP.name, IW.word, 0, 0)
  and FollowsTok(IW.word, LN.lastname, 0, 0);
/*** Translation for Rule 4** This rule will find person names like David Thomas*//*<rule annotation=Person id=4><internal><token attribute={etc}PERSON:ST:FNAME{etc}>CAPSPERSON</token><token attribute={etc}PERSON:ST:LNAME{etc}>CAPSPERSON</token></internal></rule>*/create view Person4WithNewLine asselect CombineSpans(FN.firstname, LN.lastname) as personfrom FirstName FN,
-- Yunyao: 05/20/2008 revised to Person4WrongCandidates due to performance reason-- NOTE: current optimizer execute Equals first thus make Person4Wrong very expensive--create view Person4Wrong as--select CombineSpans(FN.firstname, LN.lastname) as person--from FirstName FN,-- LastName LN--where FollowsTok(FN.firstname, LN.lastname, 0, 0)-- and ContainsRegex(/[\n\r]/, SpanBetween(FN.firstname, LN.lastname))-- and Equals(GetText(FN.firstname), GetText(LN.lastname));
create view Person4WrongCandidates asselect FN.firstname as firstname, LN.lastname as lastnamefrom FirstName FN,
and ContainsRegex(/[\n\r]/, SpanBetween(FN.firstname, LN.lastname));
create view Person4 as(select P.person as person from
Person4WithNewLine P)minus(select CombineSpans(P.firstname, P.lastname) as
person from Person4WrongCandidates Pwhere Equals(GetText(P.firstname),
GetText(P.lastname)));/*** Translation for Rule4a* This rule will find person names like Thomas, David*//*<rule annotation=Person id=4a><internal><token attribute={etc}PERSON:ST:LNAME{etc}>CAPSPERSON</token><token attribute={etc}>\,</token><token attribute={etc}PERSON:ST:FNAME{etc}>CAPSPERSON</token></internal></rule>*/create view Person4a asselect CombineSpans(LN.lastname, FN.firstname) as personfrom FirstName FN,
-- relaxed version of Rule4a-- Yunyao: split the following rules into two to improve performance-- TODO: Test case for optimizer -- create view Person4ar1 as-- select CombineSpans(CP.name, FN.firstname) as person--from FirstName FN,-- CapsPerson CP--where FollowsTok(CP.name, FN.firstname, 1, 1)--and ContainsRegex(/,/,SpanBetween(CP.name, FN.firstname))--and Not(MatchesRegex(/(.|\n|\r)*(\.|\?|!|'|\sat|\sin)( )*/, LeftContext(CP.name, 10)))--and Not(MatchesRegex(/(?i)(.+fully)/, CP.name))--and GreaterThan(GetBegin(CP.name), 10);
create view Person4ar1temp asselect FN.firstname as firstname, CP.name as namefrom FirstName FN,
/**
 * Translation for Rule 2
 *
 * This rule handles names of persons like B.M. Thomas David, where Thomas occurs in some person dictionary
 */
/*
<rule annotation=Person id=2>
<internal>
<token attribute={etc}>INITIALWORD</token>
<token attribute={etc}PERSON{etc}>CAPSPERSON</token>
<token attribute={etc}>CAPSPERSON</token>
</internal>
</rule>
*/
-- Person2: InitialWord, then a PersonDict match, then CapsPerson, each
-- following the previous one with zero tokens in between. The result span runs
-- from the InitialWord match through the CapsPerson match.
create view Person2 as
select CombineSpans(IW.word, CP.name) as person
from InitialWord IW,
     PersonDict P,
     CapsPerson CP
where FollowsTok(IW.word, P.name, 0, 0)
  and FollowsTok(P.name, CP.name, 0, 0);
/*** Translation for Rule 2a** The rule handles names of persons like B.M. Thomas David, where David occurs in some person dictionary*//*<rule annotation=Person id=2a><internal><token attribute={etc}>INITIALWORD</token><token attribute={etc}>CAPSPERSON</token><token attribute={etc}>NEWLINE</token>?<token attribute={etc}PERSON{etc}>CAPSPERSON</token></internal></rule>*/
-- Person2a: InitialWord, then CapsPerson, then a PersonDict match, each
-- following the previous one with zero tokens in between. The result span runs
-- from the InitialWord match through the PersonDict match.
create view Person2a as
select CombineSpans(IW.word, P.name) as person
from InitialWord IW,
     CapsPerson CP,
     PersonDict P
where FollowsTok(IW.word, CP.name, 0, 0)
  and FollowsTok(CP.name, P.name, 0, 0);
/*** Translation for Rule 4r2** This relaxed version of rule '4' will find person names Thomas, David* But it only insists that the SECOND word is in some person dictionary*//*<rule annotation=Person id=4r2><token attribute={etc}>ANYWORD</token><internal><token attribute={etc}>CAPSPERSON</token><token attribute={etc}>NEWLINE</token>?<token attribute={etc}PERSON:ST:LNAME{etc}>CAPSPERSON</token></internal></rule>*/create view Person4r2 asselect CombineSpans(CP.name, LN.lastname) as personfrom CapsPerson CP,
/*** Translation for Rule 5** This rule will find other single token person first names*//* <rule annotation=Person id=5><internal><token attribute={etc}>INITIALWORD</token>?<token attribute={etc}PERSON:ST:FNAME{etc}>CAPSPERSON</token></internal></rule>*/create view Person5 asselect CombineSpans(IW.word, FN.firstname) as personfrom InitialWord IW,
/*** Translation for Rule 6** This rule will find other single token person last names*//* <rule annotation=Person id=6><internal><token attribute={etc}>INITIALWORD</token>?<token attribute={etc}PERSON:ST:LNAME{etc}>CAPSPERSON</token></internal></rule>*/
create view Person6 asselect CombineSpans(IW.word, LN.lastname) as personfrom InitialWord IW,
--==========================================================-- End of rules---- Create final list of names based on all the matches extracted----==========================================================
/**
 * Union all matches found by strong rules, except the ones that come
 * directly from dictionary matches.
 *
 * Inputs: Person1, Person3, Person4, Person3P1 (the Person1a_more branch is
 * commented out).
 */
create view PersonStrongWithNewLine as
    (select P.person as person from Person1 P)
--union all
--    (select P.person as person from Person1a_more P)
union all
    (select P.person as person from Person3 P)
union all
    (select P.person as person from Person4 P)
union all
    (select P.person as person from Person3P1 P);
-- PersonStrongSingleTokenOnly: bag union of Person5, Person6, every FirstName,
-- every LastName, and Person1a, all exposed under a single `person` column.
create view PersonStrongSingleTokenOnly as
    (select P.person as person from Person5 P)
union all
    (select P.person as person from Person6 P)
union all
    (select P.firstname as person from FirstName P)
union all
    (select P.lastname as person from LastName P)
union all
    (select P.person as person from Person1a P);
-- Yunyao: added 05/09/2008 to expand person names with suffix
-- Extends a PersonStrongSingleTokenOnly span through a PersonSuffix match that
-- follows it with zero tokens in between.
create view PersonStrongSingleTokenOnlyExpanded1 as
select CombineSpans(P.person, S.suffix) as person
from PersonStrongSingleTokenOnly P,
     PersonSuffix S
where FollowsTok(P.person, S.suffix, 0, 0);
-- Yunyao: added 04/14/2009 to expand single token person name with a single initial
-- extend single token person with a single initial
-- The 3-character right context is tested for: one or more spaces, a single
-- uppercase letter, a word boundary, then optional whitespace. When it matches,
-- the person span is extended by its 2-character right context.
create view PersonStrongSingleTokenOnlyExpanded2 as
select CombineSpans(R.person, RightContext(R.person,2)) as person
from PersonStrongSingleTokenOnly R
where MatchesRegex(/ +[\p{Upper}]\b\s*/, RightContext(R.person,3));
-- PersonStrongSingleToken: bag union of the single-token person candidates and
-- their two expanded variants (with suffix, with trailing initial).
create view PersonStrongSingleToken as
    (select P.person as person from PersonStrongSingleTokenOnly P)
union all
    (select P.person as person from PersonStrongSingleTokenOnlyExpanded1 P)
union all
    (select P.person as person from PersonStrongSingleTokenOnlyExpanded2 P);
/**
 * Union all matches found by weak rules
 *
 * Inputs: Person3r1, Person3r2, Person4r1, Person4r2, Person2, Person2a,
 * Person3P2, Person3P3.
 */
create view PersonWeak1WithNewLine as
    (select P.person as person from Person3r1 P)
union all
    (select P.person as person from Person3r2 P)
union all
    (select P.person as person from Person4r1 P)
union all
    (select P.person as person from Person4r2 P)
union all
    (select P.person as person from Person2 P)
union all
    (select P.person as person from Person2a P)
union all
    (select P.person as person from Person3P2 P)
union all
    (select P.person as person from Person3P3 P);
-- weak rules that identify (LastName, FirstName)create view PersonWeak2WithNewLine as
(select P.person as person from Person4a P)union all
(select P.person as person from Person4ar1 P)union all
-- PersonBase: bag union of the strong-rule matches and both groups of
-- weak-rule matches. The view is also declared as an output of the extractor.
create view PersonBase as
    (select P.person as person from PersonStrongWithNewLine P)
union all
    (select P.person as person from PersonWeak1WithNewLine P)
union all
    (select P.person as person from PersonWeak2WithNewLine P);

output view PersonBase;
from Dictionary('names/name_israel.dict', Doc.text) Dwhere MatchesRegex(/\p{Lu}\p{M}*.{1,20}/, D.match);
create view NamesAll as(select P.name as name from NameDict P)union all(select P.name as name from NameDict1 P)union all(select P.name as name from NameDict2 P)union all(select P.name as name from NameDict3 P)union all(select P.name as name from NameDict4 P)union all(select P.firstname as name from FirstName P)union all
-- PersonDict: the rows of NamesAll consolidated on the name span.
-- No explicit consolidation policy is given, so the engine default applies.
create view PersonDict as
select C.name as name
--from Consolidate(NamesAll.name) C;
from NamesAll C
consolidate on C.name;
--==========================================================-- Actual Rules--==========================================================
-- For 3-part Person namescreate view Person3P1 as select CombineSpans(F.firstname, L.lastname) as personfrom StrictFirstName F,
/*** Translation for Rule 1* Handles names of persons like Mr. Vladimir E. Putin*//*<rule annotation=Person id=1><token attribute={etc}INITIAL{etc}>CANYWORD</token><internal><token attribute={etc}>CAPSPERSON</token><token attribute={etc}>INITIALWORD</token><token attribute={etc}>CAPSPERSON</token></internal></rule>*/
create view Person1 asselect CombineSpans(CP1.name, CP2.name) as personfrom Initial I,
/*** Translation for Rule 1a* Handles names of persons like Mr. Vladimir Putin*//* <rule annotation=Person id=1a><token attribute={etc}INITIAL{etc}>CANYWORD</token><internal><token attribute={etc}>CAPSPERSON</token>{1,3}</internal></rule>*/
-- Split into two rules so that single token annotations are separated from others
-- Single token annotations
-- Person1a1: a CapsPerson match that follows an Initial match with zero tokens
-- in between, provided the text between the two contains no newline or tab.
-- NOTE(review): the class below is [\n\t], while other newline checks in this
-- file use [\n\r] -- confirm whether \r was intended here.
create view Person1a1 as
select CP1.name as person
from Initial I,
     CapsPerson CP1
where FollowsTok(I.initial, CP1.name, 0, 0)
--- start changing this block
--- disallow newline
  and Not(ContainsRegex(/[\n\t]/,SpanBetween(I.initial,CP1.name)))
--- end changing this block
;
-- Yunyao: added 05/09/2008 to match patterns such as "Mr. B. B. Buy"/*create view Person1a2 as select CombineSpans(name.block, CP1.name) as personfrom Initial I,
and Not(ContainsRegex(/[\n\t]/,name.block))--- start changing this block-- disallow newlineand Not(ContainsRegex(/[\n\t]/,SpanBetween(I.initial,name.block)))
--- end changing this block;
*/
/*** Translation for Rule 3* Find person names like Thomas B.M. David*//*<rule annotation=Person id=3><internal><token attribute={etc}PERSON{etc}>CAPSPERSON</token><token attribute={etc}>INITIALWORD</token><token attribute={etc}PERSON{etc}>CAPSPERSON</token></internal></rule>*/
create view Person3 asselect CombineSpans(P1.name, P2.name) as personfrom PersonDict P1,
/*** Translation for Rule 3r1* * This relaxed version of rule '3' will find person names like Thomas B.M. David* But it only insists that the first word is in the person dictionary*//*<rule annotation=Person id=3r1><internal><token attribute={etc}PERSON:ST:FNAME{etc}>CAPSPERSON</token><token attribute={etc}>INITIALWORD</token><token attribute={etc}>CAPSPERSON</token></internal></rule>*/
create view Person3r1 as
create view Initial as
--'Junior' (Yunyao: comments out to avoid mismatches such as Junior National [team player],-- If we can have large negative dictionary to eliminate such mismatches, -- then this may be recovered --'Name:' ((Yunyao: comments out to avoid mismatches such as 'Name: Last Name')-- for German names-- TODO: need further test,'herr', 'Fraeulein', 'Doktor', 'Herr Doktor', 'Frau Doktor','Herr Professor', 'Frau professor', 'Baron', 'graf'
);
-- Find dictionary matches for all title initials
select D.match as initial--'Name:' ((Yunyao: comments out to avoid mismatches such as 'Name: Last Name')-- for German names-- TODO: need further test,'herr', 'Fraeulein', 'Doktor', 'Herr Doktor', 'Frau Doktor','Herr Professor', 'Frau professor', 'Baron', 'graf'
);
-- Find dictionary matches for all title initials
from Dictionary('InitialDict', Doc.text) D;
-- Yunyao: added 05/09/2008 to capture person name suffix
-- Inline dictionary of name suffixes; consumed by the PersonSuffix view.
create dictionary PersonSuffixDict as
(
    ',jr.', ',jr', 'III', 'IV', 'V', 'VI'
);
-- PersonSuffix: every match of PersonSuffixDict in the document text,
-- exposed as the `suffix` column.
create view PersonSuffix as
select D.match as suffix
from Dictionary('PersonSuffixDict', Doc.text) D;
-- Find capitalized words that look like person names and not in the non-name dictionarycreate view CapsPersonCandidate asselect R.match as name--from Regex(/\b\p{Upper}\p{Lower}[\p{Alpha}]{1,20}\b/, Doc.text) R--from Regex(/\b\p{Upper}\p{Lower}[\p{Alpha}]{0,10}(['-][\p{Upper}])?[\p{Alpha}]{1,10}\b/, Doc.text) R -- change to enable unicode match--from Regex(/\b\p{Lu}\p{M}*[\p{Ll}\p{Lo}]\p{M}*[\p{L}\p{M}*]{0,10}(['-][\p{Lu}\p{M}*])?[\p{L}\p{M}*]{1,10}\b/, Doc.text) R --from Regex(/\b\p{Lu}\p{M}*[\p{Ll}\p{Lo}]\p{M}*[\p{L}\p{M}*]{0,10}(['-][\p{Lu}\p{M}*])?(\p{L}\p{M}*){1,10}\b/, Doc.text) R -- Allow fully capitalized words--from Regex(/\b\p{Lu}\p{M}*(\p{L}\p{M}*){0,10}(['-][\p{Lu}\p{M}*])?(\p{L}\p{M}*){1,10}\b/, Doc.text) R from RegexTok(/\p{Lu}\p{M}*(\p{L}\p{M}*){0,10}(['-][\p{Lu}\p{M}*])?(\p{L}\p{M}*){1,10}/, 4, Doc.text) R --'where Not(ContainsDicts(
-- Find strict capitalized words with two letter or more (relaxed version of StrictCapsPerson)
--============================================================
-- TODO: need to think through how to deal with hyphenated names.
-- One way to do so is to run Regex(pattern, CP.name) and enforce that CP.name does not contain '
-- Need more testing before confirming the change.
-- StrictLastName: bag union (duplicates kept) of StrictLastName1 .. StrictLastName9.
create view StrictLastName as
    (select S.lastname as lastname from StrictLastName1 S)
union all
    (select S.lastname as lastname from StrictLastName2 S)
union all
    (select S.lastname as lastname from StrictLastName3 S)
union all
    (select S.lastname as lastname from StrictLastName4 S)
union all
    (select S.lastname as lastname from StrictLastName5 S)
union all
    (select S.lastname as lastname from StrictLastName6 S)
union all
    (select S.lastname as lastname from StrictLastName7 S)
union all
    (select S.lastname as lastname from StrictLastName8 S)
union all
    (select S.lastname as lastname from StrictLastName9 S);
-- Relaxed version of last namecreate view RelaxedLastName1 asselect CombineSpans(SL.lastname, CP.name) as lastnamefrom StrictLastName SL,
(select N.lastname as lastname from StrictLastName N)union all(select N.lastname as lastname from RelaxedLastName1 N)union all(select N.lastname as lastname from RelaxedLastName2 N);
create view ValidLastNameAll asselect N.lastname as lastname
'firstname', 'Name', 'familyname',-- Italian greeting'Ciao',-- Spanish greeting'Hola',-- French greeting'Bonjour',-- new entries 'Pro','Bono','Enterprises','Group','Said','Says','Assis
'Adding', 'Acquire', 'Addition', 'America',-- short phrases that are likely to be at the start of a
sentence'Yes', 'No', 'Ja', 'Nein','Kein', 'Keine', 'Gegenstimme',-- TODO: to be double checked'Another', 'Anyway','Associate', 'At', 'Athletes', 'It',
'Enron', 'EnronXGate', 'Have', 'However','Company', 'Companies', 'IBM','Annual', -- common verbs appear with person names in
financial reports-- ideally we want to have a general comprehensive
verb list to use as a filter dictionary'Joins', 'Downgrades', 'Upgrades', 'Reports', 'Sees', 'Warns', 'Announces', 'Reviews'-- Laura 06/02/2009: new filter dict for title for SEC
'firstname', 'Name', 'familyname',-- Italian greeting'Ciao',-- Spanish greeting'Hola',-- French greeting'Bonjour',-- new entries 'Pro','Bono','Enterprises','Group','Said','Says','Assis
'Adding', 'Acquire', 'Addition', 'America',-- short phrases that are likely to be at the start of a
sentence'Yes', 'No', 'Ja', 'Nein','Kein', 'Keine', 'Gegenstimme',-- TODO: to be double checked'Another', 'Anyway','Associate', 'At', 'Athletes', 'It',
'Enron', 'EnronXGate', 'Have', 'However','Company', 'Companies', 'IBM','Annual', -- common verbs appear with person names in
financial reports-- ideally we want to have a general comprehensive
verb list to use as a filter dictionary'Joins', 'Downgrades', 'Upgrades', 'Reports', 'Sees', 'Warns', 'Announces', 'Reviews'-- Laura 06/02/2009: new filter dict for title for SEC