MeCC: Memory Comparison-based Clone Detector Heejung Kim 1 , Yungbum Jung 1 , Sunghun Kim 2 , and Kwangkeun Yi 1 1 Seoul National University 2 The Hong Kong University of Science and Technology 1 http://ropas.snu.ac.kr/mecc/
May 23, 2015
MeCC: Memory Comparison-based Clone Detector
Heejung Kim¹, Yungbum Jung¹, Sunghun Kim², and Kwangkeun Yi¹
1 Seoul National University
2 The Hong Kong University of Science and Technology
1
http://ropas.snu.ac.kr/mecc/
Code Clones
• similar code fragments (syntactically or semantically)
2
static PyObject *float_add(PyObject *v, PyObject *w){ double a,b; CONVERT_TO_DOUBLE(v,a); CONVERT_TO_DOUBLE(w,b); PyFPE_START_PROTECT(“add”,return 0) a = a + b; PyFPE_END_PROTECT(a) return PyFloat_FromDouble(a);}
static PyObject *float_mul(PyObject *v, PyObject *w){ double a,b; CONVERT_TO_DOUBLE(v,a); CONVERT_TO_DOUBLE(w,b); PyFPE_START_PROTECT(“multiply”,return 0) a = a * b; PyFPE_END_PROTECT(a) return PyFloat_FromDouble(a);}
Applications of Code Clones
3
• software refactoring
• detecting potential bugs
• understanding software evolution
• detecting software plagiarism (malicious duplication)
Clone Detectors
• CCFinder [TSE’02]
textual tokens
• DECKARD [ICSE’07] AST characteristic vectors
• PDG-based [ICSE‘08, SAS’01] program dependence graph
4
Effective for syntactic code clones
limited for semantic code clones
Three code clones missed by syntax-based
clone detection
5
PyObject *PyBool_FromLong (long ok) { PyObject *result; if (ok) result = Py_True; else result = Py_False; Py_INCREF(result); return result;}
6
#1 Control Replacement
static PyObject *get_pybool (int istrue) { PyObject *result = istrue? Py_True: Py_False;
Py_INCREF(result); return result;}
syntactically different but semantically identical
void appendPQExpBufferChar (PQExpBuffer str, char ch) { /* Make more room if needed */ if (!enlargePQExpBuffer(str, 1)) return; /* OK, append the data */ str->data[str->len] = ch; str->len++; str->data[str->len] = ‘\0’;}
7
#2 Capturing Procedural Effects
understanding memory behavior of procedures
void appendBinaryPQExpBuffer (PQExpBuffer str, const char* data, size_t datalen) { /* Make more room if needed */ if (!enlargePQExpBuffer(str, datalen)) return; /* OK, append the data */ memcpy(str->data + str->len, data, datalen); str->len+= datalen; str->data[str->len] = ‘\0’;}
... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf =
ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) {
return err;}conf->access_name = apr_pstrdup(cmd->pool,arg);return NULL;
}
... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf =
ap_get_module_config(cmd->server->module_config, &core_module); char *proto;
if (err != NULL) { return err;}proto = apr_pstrdup(cmd->pool,arg);ap_str_tolower(proto);conf->protocol = proto;return NULL;
} 8
#3 More Complex Clone
... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf =
ap_get_module_config(cmd->server->module_config, &core_module); char *proto;
if (err != NULL) { return err;}proto = apr_pstrdup(cmd->pool,arg);ap_str_tolower(proto);conf->protocol = proto;return NULL;
}
... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf =
ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) {
return err;}conf->access_name = apr_pstrdup(cmd->pool,arg);return NULL;
}
statement reordering
9
�
... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf =
ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) {
return err;}conf->access_name = apr_pstrdup(cmd->pool,arg);return NULL;
}
... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf =
ap_get_module_config(cmd->server->module_config, &core_module); char *proto;
if (err != NULL) { return err;}proto = apr_pstrdup(cmd->pool,arg);ap_str_tolower(proto);conf->protocol = proto;return NULL;
}
intermediate variables
statement reordering
10
... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf =
ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) {
return err;}conf->access_name = apr_pstrdup(cmd->pool,arg);return NULL;
}
intermediate variables
statement splitting
statement reordering
11
... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf =
ap_get_module_config(cmd->server->module_config, &core_module); char *proto;
if (err != NULL) { return err;}proto = apr_pstrdup(cmd->pool,arg);ap_str_tolower(proto);conf->protocol = proto;return NULL;
}
... *set_access_name(cmd_parms *cmd, void *dummy, const char *arg){ void *sconf = cmd->server->module_config; core_server_config *conf =
ap_get_module_config(sconf, &core_module); const char *err = ap_check_cmd_context(sconf,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); if (err != NULL) {
return err;}conf->access_name = apr_pstrdup(cmd->pool,arg);return NULL;
}
... *set_protocol(cmd_parms *cmd, void *dummy, const char *arg){ const char *err = ap_check_cmd_context(cmd,NOT_IN_DIR_LOC_FILE | NOT_IN_LIMIT); core_server_config *conf =
ap_get_module_config(cmd->server->module_config, &core_module); char *proto;
if (err != NULL) { return err;}proto = apr_pstrdup(cmd->pool,arg);ap_str_tolower(proto);conf->protocol = proto;return NULL;
} 12
intermediate variables
statement splitting
statement reordering
13
These Semantic Clones are Identified by MeCC
MeCC: Our Approach
• Static analyzer estimates the semantics of programs
• Abstract memories are results of analysis
• Comparing abstract memories is a measure
14
Clone Detection Process
program
procedures
15
P1
P3 P4
P2
�P
program
procedures abstract memories P1
P3 P4
P2
�P
Static Analyzer
$F(\vec{P}) = \vec{M}$
Clone Detection Process
program
procedures
Comparing Memories
abstract memories
similarities
$S(M, M')$
17
P1
P3 P4
P2
�P
Static Analyzer
F(�P ) = �M
Clone Detection Process
program
procedures
ComparingMemories
Grouping
abstractmemories
similarities
S(M,M�)
18
P1
P3 P4
P2
P1
P3
P2
P4
Code Clones
�P
Static Analyzer
F(�P ) = �M
Clone Detection Process
program
procedures
ComparingMemories
Grouping
abstractmemories
similarities
S(M,M�)
19
Static Analyzer
P1
P3 P4
P2
P1
P3
P2
P4
Code Clones
�P
F(�P ) = �M
Clone Detection Process
Estimating Semantics by Abstract Memories
int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r;}
• Estimating an abstract memory at the procedure’s exit point
• Abstract memory is a map from abstract addresses to abstract values
20
Address Values
$S(M_1, M_2)$
$\dfrac{\log \mathit{MinEntry}}{\log(|M_1| + |M_2|)}$
$\dfrac{2\,(2 \cdot 1.0 + 2 \cdot 1.0 + 1 \cdot 0.5)}{6 + 5} = 0.82$
a ↦ {(true, α)}; count ↦ {(true, β)}; r ↦ {(true, β + 1)}; α.next ↦ {(α ≠ 0, ℓ)}; ℓ.val ↦ {(α ≠ 0, β)}; RETV ↦ {(α = 0, β + 1 − 1), (α ≠ 0, β + 1)}
a ↦ {(true, α)}; b ↦ {(true, β)}; α.n ↦ {(α ≠ 0, ℓ)}; ℓ.v ↦ {(α ≠ 0, β)}; RETV ↦ {(α = 0, β), (α ≠ 0, β + 2)}
{}, {} ⊢ P ⇓ v, M
{}, {} ⊢ P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
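$\Delta \vdash T^{*} : \Delta'$
$\Gamma, \Delta \vdash E : \tau$
$\dfrac{\emptyset \vdash T^{*} : \Delta \qquad \emptyset, \Delta \vdash E : \tau}{\emptyset, \emptyset \vdash T^{*}\; E : \tau}$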
2
Estimating Semantics by Abstract Memories
int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r;}
• Estimating an abstract memory at the procedure’s exit point
• Abstract memory is a map from abstract addresses to abstract values
21
Address Values
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r;}
Address Values
22
Estimating Semantics by Abstract Memories
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
• Use symbols for unknown input values
• All abstract values are guarded by execution path conditions
int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r;}
23
Estimating Semantics by Abstract Memories
• Use symbols for unknown input values
• All abstract values are guarded by execution path conditions
Address Values
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
copy and modify
24
Estimating Semantics by Abstract Memories
int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r;}
Address Values
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
int make2 (list2 *a, int b){ if (a==0) return b; a->n = malloc(...); a->n->v = b; return b + 2;}
25
Estimating Semantics by Abstract Memories
int make2 (list2 *a, int b){ if (a==0) return b; a->n = malloc(...); a->n->v = b; return b + 2;}
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
int make (list *a, int count){ int r = count + 1; if (a!=0){ a->next = malloc(...); a->next->val = count; } else { return r - 1; } return r;}
Address Values
copy and modify
Address Values
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
program
procedures
ComparingMemories
Grouping
abstractmemories
similarities
S(M,M�)
26
P1
P3 P4
P2
P1
P3
P2
P4
Code Clones
�P
Static Analyzer
F(�P ) = �M
Clone Detection Process
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
�.val �.v
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
�.val �.v α.n
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α �= 0,β + 1− 1), (α = 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α �= 0,β), (α = 0,β + 1)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
1
field addresses
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
�.val �.v α.n
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
27
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α �= 0,β + 1− 1), (α = 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α �= 0,β), (α = 0,β + 1)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
1
local variables
Comparing Abstract Memories
1. Classifying addresses into similar classes
parameters
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α �= 0,β + 1− 1), (α = 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α �= 0,β), (α = 0,β + 1)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
1
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α �= 0,β + 1− 1), (α = 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α �= 0,β), (α = 0,β + 1)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
1
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α �= 0,β + 1− 1), (α = 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α �= 0,β), (α = 0,β + 1)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
1
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α �= 0,β + 1− 1), (α = 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α �= 0,β), (α = 0,β + 1)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
1
return address
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
score 0.5
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
2
{(α = 0,β + 1− 1), (α �= 0,β + 1)}
a {(true,α)}count {(true,β)}r {(true,β + 1)}α.next {(α �= 0, �)}α.val {(α �= 0,β)}RETV {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
2
score 1.0
a {(true, α)}; count {(true, β)}; r {(true, β + 1)}; α.next {(α ≠ 0, ℓ)}; α.val {(α ≠ 0, β)}; RETV {(α = 0, β + 1 − 1), (α ≠ 0, β + 1)}
a {(true,α)}b {(true,β)}α.n {(α �= 0, �)}α.v {(α �= 0,β)}RETV {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
Γ,∆ � E : τ ∆(y) = {x �→ τ loc}Γ,∆ � {x := E} : y
Γ,∆ � {} : x
Γ[t → t�/f ][t loc/x],∆ � E1 : t� Γ[t → t�/f ],∆ � E2 : t��
Γ,∆ � let procedure f(t x) : t� = E1 in E2 : t��
∆ � T1 : ∆1 ∆1 � T2 : ∆2
∆ � T1 T2 : ∆2
x1 �= x2
∆ � type x = {t1 x1, ..., tk xk} : ∆[{x1 �→ t1 loc, ..., xk �→ tk loc}/x]
2
Comparing Abstract Memories
2. Compare guarded values in the same similar classes (score 0.0 to 1.0)
28
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
Comparing Abstract Memories
3. Find the best combination that maximizes the total score
{(true,α)}
$\dfrac{4 \times 1.0 + 1 \times 0.0 + 4 \times 1.0 + 2 \times 0.5}{6 + 5} = 0.82$
$\dfrac{\text{maximum score}}{|M_1| + |M_2|}$
| F(c)− F(c�) |
i := 0;while i < 10
b := random_bool();if b then i := i + 1;
end
k = 0 j = 1k = 1 j = 2 + 1k = 2 j = 2 + 2 + 1... ...
{n ≥ 0} x = bar(n) {x = 1 + 2n}{I ∧B} S {I}
{I} while B S {I ∧ ¬B}
{P ∧B} S1 {Q} {P ∧ ¬B} S2 {Q}{P} if B then S1 else S2 {Q}
{y + 7 > 42} x := y + 7 {x > 42}
{y = x+ 10 ∧ x = 1} x := y + 7 {y = x� + 10 ∧ x� = 1 ∧ x := x� + 10 + 7}
{y = y + 7− 7 ∧ y + 7 = 18} x := y + 7 {y = x− 7 ∧ x = 18}
{P} x := E {P [x�/x] ∧ (x = E[x�/x])}
{y + 7 > 42} x = y + 7 {x > 42}
{x ≥ 0 ∧ y ≥ 0} z := x+ y {z ≥ 0}
{x ≥ 0 ∧ y ≥ 0} z := x+ y {z ≥ 0 ∧ x ≥ 0 ∧ y ≥ 0}
{x = 5 ∧ y = 2} z := x+ y {x = 5 ∧ z = 7}
{n ≥ 0 ∧ n2 > 28} m := n+ 1;m := m ∗m {¬(m = 36)}
{∀i.a[i] = 10 ∧ k ≥ 0} a[k] = 0 {∀i.a[i] = 10 ∧ k ≥ 0}
F(�c) = �M
S(M,M�)
1
{(true,α)}
$\dfrac{4 \times 1.0 + 1 \times 0.0 + 4 \times 1.0 + 2 \times 0.5}{6 + 5} = 0.82$
$S(M_1, M_2) = \dfrac{\text{maximum score}}{|M_1| + |M_2|}$
| F(c)− F(c�) |
i := 0;while i < 10
b := random_bool();if b then i := i + 1;
end
k = 0 j = 1k = 1 j = 2 + 1k = 2 j = 2 + 2 + 1... ...
{n ≥ 0} x = bar(n) {x = 1 + 2n}{I ∧B} S {I}
{I} while B S {I ∧ ¬B}
{P ∧B} S1 {Q} {P ∧ ¬B} S2 {Q}{P} if B then S1 else S2 {Q}
{y + 7 > 42} x := y + 7 {x > 42}
{y = x+ 10 ∧ x = 1} x := y + 7 {y = x� + 10 ∧ x� = 1 ∧ x := x� + 10 + 7}
{y = y + 7− 7 ∧ y + 7 = 18} x := y + 7 {y = x− 7 ∧ x = 18}
{P} x := E {P [x�/x] ∧ (x = E[x�/x])}
{y + 7 > 42} x = y + 7 {x > 42}
{x ≥ 0 ∧ y ≥ 0} z := x+ y {z ≥ 0}
{x ≥ 0 ∧ y ≥ 0} z := x+ y {z ≥ 0 ∧ x ≥ 0 ∧ y ≥ 0}
{x = 5 ∧ y = 2} z := x+ y {x = 5 ∧ z = 7}
{n ≥ 0 ∧ n2 > 28} m := n+ 1;m := m ∗m {¬(m = 36)}
{∀i.a[i] = 10 ∧ k ≥ 0} a[k] = 0 {∀i.a[i] = 10 ∧ k ≥ 0}
F(�c) = �M
S(M,M�)
1
29
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
S(M1,M2)
log MinEntry
log(| M1 | + | M2 |)
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a �→ {(true,α)}count �→ {(true,β)}r �→ {(true,β + 1)}α.next �→ {(α �= 0, �)}�.val �→ {(α �= 0,β)}RETV �→ {(α = 0,β + 1− 1), (α �= 0,β + 1)}
a �→ {(true,α)}b �→ {(true,β)}α.n �→ {(α �= 0, �)}�.v �→ {(α �= 0,β)}RETV �→ {(α = 0,β), (α �= 0,β + 2)}
{}, {} � P ⇓ v,M
{}, {} � P : τ
type list = {int x, list next}
let list node = {x:=1, next:={}}in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
∆ � T ∗ : ∆�
Γ,∆ � E : τ
∅ � T ∗ : ∆ ∅,∆ � E : τ∅,∅ � T ∗ E : τ
2
≥ 0.8
Experimental Results
30
Subject Projects
31
Projects KLOC Procedures Application
Python 435 7,657 interpreter
Apache 343 9,483 web server
PostgreSQL 937 10,469 database
Detected Clones
Total 623 code clones
6%
39% 53%
2%
Type-1 Type-2 Type-3 Type-4
C. K. Roy and J. R. Cordy. A survey on software clone detection research. SCHOOL OF COMPUTING TR 2007-541, QUEENʼS UNIVERSITY, 115, 2007.
Semantic Clones 45%
Total 623 code clones
6%
39% 53%
2%
Type-1 Type-2 Type-3 Type-4
Comparison
CCfinder
PDG-based
DECKARD
MeCC
0 75 150 225 300
DECKARD: characteristic vectors
PDG-based: program dependency graphs
34
CCfinder
PDG-based
DECKARD
MeCC
0 10 20 30 40
Type-4
CCfinder: textual tokens
Type-3
Applications of Code Clones
35
• software refactoring
• detecting potential bugs
• understanding software evolution
• detecting software plagiarism (malicious duplication)
Finding Potential Bugs
• A large portion of semantic clones are due to inconsistent changes
• Inconsistent changes may lead to potential bugs (inconsistent clones)
36
Two semantic clones with potential bugs
#1 Missed Null Check
37
const char *GetVariable (VariableSpace space, const char *name){ struct_variable *current; if (!space) return NULL; for (current=space->next;current;current=current->next) { if (strcmp(current->name,name) == 0) { return current->value; } } return NULL;}
const char *PQparameterStatus (const PGconn *conn, const char *paramName){ const pgParameterStatus *pstatus; if (!conn || !paramName) return NULL; for (pstatus=conn->pstatus; pstatus!=NULL; pstatus = pstatus->next) { if (strcmp(pstatus->name,paramName)== 0) return pstatus->value; } return NULL;}
if (!space)
The parameter name should also be checked!
38
Python project revision #20157
A resource leak without endpwent() procedure call
#2 A Resource Leak Bug
PyObject *pwd_getpwall (PyObject *self){ PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d;}
open user database
close user database
PyObject *pwd_getpwall (PyObject *self){ PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d;}
39
A Bug-free Procedure
Python project revision #38359
PyObject *spwd_getspall (PyObject *self, PyObject *args){ PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d;}
40
PyObject *pwd_getpwall (PyObject *self){ PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d;}
The Bug is Fixed Later
Python project revision #73017
bug-fixed: endpwent();
PyObject *spwd_getspall (PyObject *self, PyObject *args){ PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d;}
41
revision #20157
revision #38359
revision #73017
Procedure A was created with a resource leak
Procedure B (a code clone of A) is introduced
without resource leaks
The resource leak bug in procedure A is fixed
The resource leak could have been fixed earlier if MeCC had been applied
4 years
42
MeCC successfully identifies these procedures
const char *GetVariable (VariableSpace space, const char *name){ struct_variable *current; if (!space) return NULL; for (current=space->next;current;current=current->next) { if (strcmp(current->name,name) == 0) { return current->value; } } return NULL;}
const char *PQparameterStatus (const PGconn *conn, const char *paramName){ const pgParameterStatus *pstatus; if (!conn || !paramName) return NULL; for (pstatus=conn->pstatus; pstatus!=NULL; pstatus = pstatus->next) { if (strcmp(pstatus->name,paramName)== 0) return pstatus->value; } return NULL;}
PyObject *pwd_getpwall (PyObject *self){ PyObject *d; struct passwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setpwent(); while ((p = getpwent()) != NULL) { PyObject *v = mkpwent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); return NULL; } Py_DECREF(v); } endpwent(); return d;}
PyObject *spwd_getspall (PyObject *self, PyObject *args){ PyObject *d; struct spwd *p; if ((d = PyList_New(0)) == NULL) return NULL; setspent(); while ((p = getspent()) != NULL) { PyObject *v = mkspent(p); if (v==NULL || PyList_Append(d,v)!=0) { Py_XDECREF(v); Py_DECREF(d); endspent(); return NULL; } Py_DECREF(v); } endspent(); return d;}
Potential Bugs and Code Smells
43
#Semantic Clones
Potential Bugs (%)
Code Smells (%)
Python 95 26 (27.4%) 23 (24.2%)
Apache 81 8 ( 9.9%) 27 (33.3%)
PostgreSQL 102 21 (20.6%) 20 (19.6%)
Total 278 55 (19.8%) 70 (25.2%)
detected by MeCC
Study Limitation
• Projects are open source and may not be representative
• All clones are manually inspected
• Default options are used for other tools (CCfinder, Deckard, PDG-based)
44
Conclusion • MeCC: Memory Comparison-based Clone
Detector
• a new clone detector using semantics-based static analysis
• tolerant to syntactic variations
• can be used to find potential bugs
45
Thank You!
46
http://ropas.snu.ac.kr/mecc/
Backup Slides
47
Time Spent
Projects KLOC FP Total Time
Python 435 39 264 1h
Apache 343 24 191 5h
PostgreSQL 937 47 278 7h
• False positive ratio is less than 15%
• Slower than other tools (deep semantic analysis)
Ubuntu 64-bit machine with a 2.4 GHz Intel Core 2 Quad CPU and 8 GB RAM.
48
Structure Initialization
49
Structure Initialization
50
Judgement of Clones
• Two parameters
• In our experiment, similarity threshold 0.8 is used
• Penalty function for small size of code clones
$S(M_1, M_2)^{\frac{\log \mathit{MinEntry}}{\log(|M_1| + |M_2|)}}$
2(2 · 1.0 + 2 · 1.0 + 1 · 0.5)/(6 + 5) = 0.82
a ↦ {(true, α)}; count ↦ {(true, β)}; r ↦ {(true, β + 1)}; α.next ↦ {(α ≠ 0, ℓ)}; α.val ↦ {(α ≠ 0, β)}; RETV ↦ {(α ≠ 0, β + 1 − 1), (α = 0, β + 1)}
a ↦ {(true, α)}; b ↦ {(true, β)}; α.n ↦ {(α ≠ 0, ℓ)}; α.v ↦ {(α ≠ 0, β)}; RETV ↦ {(α ≠ 0, β), (α = 0, β + 2)}
$\{\}, \{\} \vdash P \Downarrow v, M$
$\{\}, \{\} \vdash P : \tau$
type list = {int x, list next}
let list node = {x:=1, next:={}} in
node.next.x
let x := {a:=1, b:=2} in E
type list = {int x, list next}
type tsil = {int x, tsil prev}
let ... {x:=1, next:={}}... {x:=1, prev:={}}
...
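$\Delta \vdash T^{*} : \Delta'$
$\Gamma, \Delta \vdash E : \tau$
$\dfrac{\emptyset \vdash T^{*} : \Delta \qquad \emptyset, \Delta \vdash E : \tau}{\emptyset, \emptyset \vdash T^{*}\, E : \tau}$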
1
51
Static Analyzer
• Flow-sensitive
• Context-sensitive by procedural summaries
• Path-sensitive
• Abstract interpretation
52
http://spa-arrow.com