8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
1/19
Towards Structuring Unstructured GenBank Metadata for Enhancing
Compara[tive …] Studies
Elizabeth S. Chen, PhD1,2,4 and Indra Neil Sarkar, PhD, MLIS1,3,4
1Center for Clinical and Translational Science,
2Department of Medicine, 3Microbiology[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
2/19
Introduc[tion …] GenBank and
explore automated approaches for structuring them
Examine potential implications for enhancing comparative biological studies
4'
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
3/19
GenBank Entry
9'
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
4/19
GenBank “Unstructured” Metadata
host field  “natural (as opposed to
laboratory[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
5/19
Related Work
Capture high-level habitat and environmental metadata (Hirschman et al., 2008)  Semi-automated identification of habitat terms in the
isolation_source field
42.4% found to be organism-related → further work needed to extract organism and anatomy information  Enable comparative microbiome h[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
6/19
Overview of Materials and Methods
GenBank
isolation source A
isolation source B
isolation source C
isolationsource
host
host A
host B
host C
organism A
organism B
organism C
Exact Match
Basic Pattern
TNR Tool
N-Gram
NCBI Taxonomy
NCBO Annotator
UMLS
NCBO
31
2
!
`'
GenBank Release
166+ million entries
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
7/19
1. Iden[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
8/19
Overall Host Results
1,350,040 total and 28,907 unique host values  95% of the values could be mapped to NCBI
Taxonom[y …]ram      (94.97%)  95.0%
Z'
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
9/19
Example Host Organism Mappings
NCBI Taxonomy  # Values  Example Values  Approach
Homo sapiens
“human”
ID: 9606
Total = 545470 (40.4%)
Unique = 609
human
Homo sapiens; age 44
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
10/19
2. Studying Isola[tion …]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
11/19
R=1%$>>(H'">$-*#$3'CaDJ5S'/+00=#WD$T&)=63'3"&L(C$%*(
!!'
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
12/19
Example Isola[tion …]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
13/19
3. Enabling Compara[tive …]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
14/19
Poten[tial] Host-Specific Query/Study
Compare organism sequences obtained from different body parts of the same host organism
'
'
!?'
Homo sapiens  Mus musculus  Bos taurus
esophagus
external auditory[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
15/19
Poten[tial] Cross-Species Query/Study
Compare organism sequences obtained from different body substances across host
organisms
'
!_'
Homo sapiens  Mus musculus  Bos taurus
saliva
feces
plasma
serum
blood
0.317
0.259
0.166
0.142
0.027
feces
blood
*""&(
milk
serum
exudate
0.947
0.021
0.014
0.006
0.004
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
16/19
Discussion
Early[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
17/19
Discussion
Continue exploring the use of NCBO Annotator to annotate a […]ola[…]
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
18/19
8/7/2019 Elizabeth Chen, PhD - Towards Structuring Unstructured GenBank
19/19
!e'
Acknowledgements
Rama Kocherlakota, PhD
NCBO Annotator Team
National Institutes of Health (National Library[…]