Diff.html
48.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
<?xml version="1.0" encoding="iso-8859-1"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta name="generator" content=
"HTML Tidy for Mac OS X (vers 31 October 2006 - Apple Inc. build 13), see www.w3.org" />
<title>
RDF Diff, Patch, Update, and Sync -- Design Issues
</title>
<style type="text/css">
/*<![CDATA[*/
.definition {text-align: right}
.proposition {text-align: right}
/*]]>*/
</style>
<link rel="Stylesheet" href="di.css" type="text/css" />
<link rel="Stylesheet" href="lncs04/article.css" type=
"text/css" />
</head>
<body xml:lang="en" lang="en">
<div class="online">
<a href="./">Up to Design Issues</a>
</div>
<div class="maketitle">
<h1 class="title">
Delta: an ontology for the distribution of differences
between RDF graphs
</h1>
<address>
<a rel="author" href=
"http://www.w3.org/People/Berners-Lee/">Tim Berners-Lee</a>
and <a rel="author" href=
"http://www.w3.org/People/Connolly/">Dan Connolly</a>,
<a rel="institute" href="http://www.csail.mit.edu/">MIT
Computer Science and Artificial Intelligence Laboratory
(CSAIL)</a><br />
<span class="thanks">This work is supported in part by
funding from US Defense Advanced Research Projects Agency
(DARPA) and Air Force Research Laboratory, Air Force
Materiel Command, USAF, under agreement number
F30602-00-2-0593, <q>Semantic Web
Development</q>.</span><br />
<span class="online">Created: 2001, current: $Revision:
1.114 $ of <!--linebreak-->
$Date: 2009/08/27 21:38:06 $</span><br />
<span class="online">Status: personal view only. Editing
status: rough. 2004/03: Extended to add pointers to
implementations, and details of actual language used. see
also: <a href=
"http://lists.w3.org/Archives/Team/sw-team/2004Jul/0008">comments
from reviewers</a></span>
</address>
<p class="online">
Keywords: RDF, Difference, patch, remote update,
synchronization, graph comparison.
</p>
</div>
<hr />
<div class="abstract">
<h4>
Abstract
</h4>
<p>
The problem of updating and synchronizing data in the
Semantic Web motivates an analog to text diffs for RDF
graphs. This paper discusses the problem of comparing two
RDF graphs, generating a set of differences, and updating a
graph from a set of differences. It discusses two forms of
difference information, the context-sensitive <q>weak</q>
patch, and the context-free <q>strong</q> patch. It gives a
proposed <strong>update ontology</strong> for patch files
for RDF, and discusses experience with proof of concept
code.
</p>
</div>
<h2>
Introduction
</h2>
<p>
The use of text files to record programs, documents, and
other artifacts is supported by version control systems such
as RCS<a href="#Tich85">[Tich85]</a> and CVS<a href=
"#Ber90">[Ber90]</a> that are based on the ability to compute
the difference between two text files and represent it as
diff<a href="#Mill85">[Mill85]</a>, i.e. a set of editing
instructions. The use of database tables to record bank
accounts and records of all sorts is supported by the
relational calculus<a href="#Codd70">[Codd70]</a> and its
expression as SQL statements. In both cases, the data goes
thru a sequence of states; not only are the states
represented explicitly (as text files or database tables) but
also the transitions from one state to the other can be
represented explicitly (either as editing instructions or SQL
insert/update statements). Difference (<samp>\Delta</samp>)
and sum (<samp>\Sigma</samp>) functions are ubiquitous in
computing and, like differentiation and integration, are
inverse in the sense that:
</p>
<p class="eqn-display">
v1 = <samp>\Sigma</samp>(v0, <samp>\Delta</samp>(v0, v1))
</p>
<p>
Since the transitions can be represented much more compactly
than the pairs of states, and the sigma function is
straightforward to compute, the deltas are useful for
efficiently updating data distributed among two or more
peers.
</p>
<p>
We are developing a Semantic Web Application Platform
(<a href="/2000/10/swap/">SWAP</a>) including tools and
applications to manipulate RDF graphs much like traditional
tools manipulate text files. It includes <code>cwm</code>, a
command-line tool for processing RDF in both the standard XML
encoding<a href="#RDF04">[RDF04]</a> and an experimental
encoding, Notation3 (n3)<a href="#Ber03">[Ber03]</a>.
</p>
<p>
As we build the Semantic Web, using RDF graphs<a href=
"#RDFC04">[RDFC04]</a> to represent data such as
bibliographies<a href="#DC02">[DC02]</a>, syndication
summaries<a href="#RSS">[RSS]</a> and medical
terminology<a href="#Gol03">[Gol03]</a>, we see a need for
difference and sum functions for RDF graphs. The use of RDF
to represent test results<a href="#EARL">[EARL]</a>,<a href=
"#OWLT">[OWLT]</a> motivates better ways to compare the
actual results of software tests with the intended results
and isolate the differences.
</p>
<h3>
<a name="Synchroniz" id="Synchroniz">The Synchronization
Problem</a>
</h3>
<p>
One of the most stubborn problems in practical computing is
that of synchronizing calendars and address books between
different devices. Various combinations of device and
program, from the same or different manufacturers, produce
very strange results on not-so-rare occasions.
</p>
<p>
The problem has three parts. There is the syntactic problem
of extracting the data from the strange device or its storage
medium and turning into something manageable, such as RDF.
There is the semantic problem of understanding what the
fields mean: can one have two home phone numbers? There is
the problem of actually synchronizing changes, particularly
in the general case that changes have been made on both
devices.
</p>
<p>
Because the direct syntactic conversion to RDF often leaves
something which has strained and awkward semantics, it is
often necessary or tempting to mix the semantic and syntactic
conversions. <span class="online">(See <a href=
"/2002/12/cal/" class="online">RDF calendaring</a>
discussions.)</span> Because the merging of changes requires
more application knowledge than the bare RDF data provides,
it is tempting to mix the conversion and sync algorithm.
However, this mixing reduces the modularity and testability
of the resulting program. Perhaps if the three stages were
separated, then a more robust system, and one more extensible
by the addition of information in new ontologies, would
result.
</p>
<p>
In the semantic web architecture, the application constraints
on the data can be represented in the ontology, and so can be
used by a a generic synchronization system.
</p>
<p>
On the one hand, the syntactic problems are straightforward,
if tedious, and the much harder semantic problems may explain
why many existing synchronization packages break down. But on
the other hand, perhaps it is the combination of the two that
result in so many failures; perhaps software that separates
the problems, treating synchronization generically, will be
more robust. We hope this work contributes to further work on
specifications such as SyncML<a href="#Sync02">[Sync02]</a>.
</p>
<p>
And while in the general case, concurrent changes may be
completely irreconcilable, the diff mechanisms discussed here
solve an interesting part of the problem space.
</p>
<h3>
Problems with the line-oriented approach
</h3>
<p>
RDF graphs can be serialized and used with traditional
line-oriented tools. In the general case, with no constraints
on how the graphs are serialized, line-oriented deltas can be
as large as the data itself, even between files representing
the same graph. However, when files are edited by hand, small
changes to the data naturally result in small textual diffs.
But since the difference is expressed as the difference
between two text files, not the difference between two
graphs, the delta is dependent on the graph serialization.
It's not enough to have the original graph to use the delta;
one needs a copy of the particular serialization.
</p>
<p>
Pretty-printing algorithms reduce the large number of
possible serializations of an RDF graph to a few actual
serializations. The difference engine<a href=
"#Kly04">[Kly04]</a> produces human-readable difference
descriptions using an algorithm analogous to comparing
pretty-printed graphs; its descriptions are not sufficient to
reconstruct one graph from the other, however.
</p>
<p>
We find it practical to use CVS to manage both hand-edited
and machine-serialized RDF data files in many cases. A
notable exception is the reference results for tests:
comparison of experimental test results versus reference
results yield many false test failures every time we change
the pretty-printing algorithm in the slightest. The cost of
managing the reference results this way is barely tolerable.
</p>
<p>
The straightforward pretty-printing algorithm works in the
obvious way when all the nodes are named (either with URIs or
literals): triples are sorted by subject, and those that
share a subject are grouped together. Notation3 has syntax
for grouping triples that shared predicates. Unlabeled nodes
(<em>blank nodes</em> or <em>bnodes</em>) that have no
incoming triples are treated like named subjects. Bnodes that
have one incoming link serve as internal nodes in the
pretty-printing tree. Bnodes that have more than one incoming
triple are given arbitrary labels for the purpose of
serialization and are hence treated like named subjects. For
example, the triples
</p>
<pre class="example">
:Bob :pet _:p.
_:p :size "small".
:Bob :brother :Pete.
_:p :mother _:p2.
:Pete :pet _:p2.
</pre>
<p>
are pretty-printed as
</p>
<pre class="example">
:Bob :brother :Pete;
:pet [
:mother _:g0;
:size "small" ] .
:Pete :pet _:g0 .
</pre>
<p>
The ordering and the identification of bnodes are the two
ways which serializations of the same graph can arbitrarily
differ. <code>Cwm</code> not only attempts to find a
serialization which minimizes the number of arbitrarily named
nodes but often happens to regenerate arbitrary names
consistently across runs. Even so, diffs of pretty-printed
RDF are still unsatisfactory, since changes as small as one
triple can lead to arbitrarily large textual diffs if that
triple changes the set of bnodes that need arbitrary labels.
</p>
<p>
To completely eliminate the arbitrary choices in how to
serialize an RDF graph, we could employ a canonicalization
algorithm such as the one<a href="#Car03s">[Car03s]</a> in
Jena<a href="#Car03">[Car03]</a>, or <a href=
"/2000/10/swap/cant.py">cant.py</a> from our own SWAP
toolkit. One problem with this approach is that the canonical
form is expressed in the N-Triples<a href=
"#RDFT04">[RDFT04]</a> representation. Deltas between
N-Triples files are verbose and tedious to read for most
practical graphs. Further, the problem of large textual diffs
resulting from small changes remains: these canonicalization
algorithms work by computing a signature for each blank node
based on nearby triples and sorting the results; adding or
removing one triple near a blank node will change its
signature and hence potentially the labeling of many bnodes.
</p>
<h2>
Goals: Economy and Robustness
</h2>
<p>
SQL statements and text file diffs are attractive because
they succinctly represent the difference between two states.
If the difference between two text files were not much
smaller than either of the text files, it would be of little
use. The essential feature of a difference algorithm, then,
is <em>economy</em>: small differences between input states
should result in small deltas.
</p>
<p>
Much of the popularity of CVS is due to its support of
concurrent development. It makes a patch file<a href=
"#Wall">[Wall]</a> representing the changes each party has
made. These changes are made, in order, to the repository
file to generate new versions. In the event that two agents
take a copy of the same version <samp>v0</samp> and make
different changes to it (<samp>v1a</samp> and
<samp>v1b</samp>), the party that commits last attempts to
make <samp>v1</samp> which incorporates both diffs:
</p>
<p class="eqn-display">
v1 = <samp>\Sigma</samp>(<samp>\Sigma</samp>(v0,
<samp>\Delta</samp>(v0, v1a)), <samp>\Delta</samp>(v0, v1b))
</p>
<p>
Note that <samp>\Delta(v0, v1b)</samp> is applied to
something other than v0. The context diff and unidiff formats
are sufficiently robust that it does work in most practical
cases. When it does not work, then the user is left with the
problem of manually reconciling the conflicts. This happens
when, for example, one party moves the date of a meeting at
the same time as someone else moves or deletes the meeting.
It may be that the criterion that a problem needs human
involvement is very application-dependent.
</p>
<p>
There are thee failure modes:
</p>
<ol>
<li>Inconsistent changes were made. This failure mode is not
automatically soluble.
</li>
<li>The patch was incapable of finding the appropriate points
in v1a at which to make the change <samp>\Delta</samp>(v0,
v1b). This form of failure we can eliminate for certain RDF
graph deltas.
</li>
<li>The patch was misapplied: the context was used to
determine points at which to make the change, but the wrong
point was used, and erroneous data resulted. This is
unacceptable.
</li>
</ol>
<p>
A <em>robust</em> patch is one which may be applied so a file
different to the one it was originally generated from,
without being misapplied and hence generating erroneous
information. In the line oriented tools, the <em>patch</em>
program was introduced to be more robust than simply applying
the patch as a series of editor commands.
</p>
<h2>
Delta and Sigma for RDF Graphs
</h2>
<p>
An RDF graph is a set of (subject, predicate, object)
triples, i.e. a set of typed links between nodes. Each node
may or may not be named (either by a URI or a literal). As a
measure of the size of the difference between two RDF graphs
<samp>G1</samp> and <samp>G2</samp>, one can use the sum of
the size of the set differences <samp>|G1-G3|</samp> and
<samp>|G2-G3|</samp> where <samp>G3</samp> is the largest
common subgraph of <samp>G1</samp> and of <samp>G2</samp>.
</p>
<h3>
Computing differences between RDF graphs
</h3>
<p>
In the case in which all the nodes are named, computing the
difference between two graphs is simple and straightforward:
</p>
<p class="definition">
If <samp>G1</samp> and <samp>G2</samp> are ground RDF graphs,
then the <em>ground graph delta</em> of <samp>G1</samp> and
<samp>G2</samp> is a pair <samp>(insertions,
deletions)</samp> where <samp>insertions</samp> is the set
difference <samp>G2-G1</samp> and <samp>deletions</samp> is
<samp>G1-G2</samp>.
</p>
<p>
This form of delta is reasonably economical: the storage cost
is linear in the size of the difference between the graphs.
Straightforward extensions with slightly improved economy
might be more specific in expressing differences in which
only one or two parts of the triple have changed.
</p>
<p>
It is also completely robust. Each statement is independent,
with no variables: there is no cause for ambiguity. The
deletion statements may be deleted from, and the insertion
statements added to, any graph.
</p>
<p>
In the case where not all of the nodes are named, finding the
largest common subgraph becomes a case of the graph
isomorphism problem. The arc labels do have names (in a very
large set of practical cases, including all those which can
be serialized as RDF/XML). Graph isomorphism is in fact a
class of difficult problem that cannot be solved in
polynomial time but which has not been shown to be NP
complete<a href="#Kob93">[Kob93]</a>. While the general graph
isomorphism problem has readily available solutions<a href=
"#Ski97">[Ski97]</a><a href="#Ski01">[Ski01]</a>, they do not
seem to be a good match for the practical cases of RDF graph
diff.
</p>
<p>
There is an interesting subset of real cases in which there
are a mixture of named and unnamed nodes, but none of the
unnamed nodes is very far from a named node. In this case,
the unnamed nodes can be indirectly identified by giving a
path from a named node. The difference is then expressed by
giving this local context and the related changes.
</p>
<h3>
A patch file format for RDF deltas
</h3>
<p>
By analogy to the text diff, there is a need not only for a
difference-finding algorithm, but for a patch file format.
Such a format needs:
</p>
<ul>
<li>a way to uniquely identify what is changing
</li>
<li>a way to distinguish between the pieces added and those
subtracted
</li>
</ul>
<p>
It is straightforward to pinpoint the parts of the graph that
have changed when all nodes are named, but less so in the
presence of anonymous nodes.
</p>
<p>
To identify what is changing, we use Notation3 expressions
for quoted RDF graphs with schema variables, and we introduce
three new terms. For example:
</p>
<pre class="example">
@prefix diff: <http://www.w3.org/2004/delta#>.
{ ?x bank:accountNo "1234578"; bank:balance 4000}
diff:replacement
{ ?x bank:accountNo "1234578"; bank:balance 3575}.
</pre>
<p>
This one new property <code>replacement</code> can express
any change. Deletions can be written <code>{...}
diff:replacement {}</code> and additions can be written
<code>{} diff:replacement {...}</code>.
</p>
<p>
The second alternative is very similar but involves two
properties, one for inserting and one for deleting:
</p>
<pre class="example">
{ ?x bank:accountNo "1234578"}
diff:deletion { ?x bank:balance 4000};
diff:insertion { ?x bank:balance 3575}.
</pre>
<p>
The form using <code>diff:insertion</code> and
<code>diff:deletion</code> is implemented in <a href=
"/2000/10/swap/doc/cwm">cwm</a>.
</p>
<p>
The first and second form are related by
</p>
<pre class="definition">
{ ?F replacement ?G } <=> { ?F deletion ?F; insertion ?G }
</pre>
<h3>
Weak and Strong diffs
</h3>
<p>
To address robustness, we distinguish two types of RDF graph
deltas: a <em>weak</em> delta gives enough information to
apply it to exactly the graph it was computed from, but a
<em>strong</em> delta specifies the changes in a
context-independent manner. The difference is not in the
patch file format, but in the information a particular patch
gives.
</p>
<p>
Returning to the bank example, if bank account numbers are
globally unique, then the replacement pattern will bind ?x to
a node identifying a particular bank account. In OWL<a href=
"#OWL">[OWL]</a> terms, if <code>bank:accountNumber</code> is
an <code>owl:InverseFunctionalProperty</code>, then the node
must be the <code>owl:sameAs</code> any other node with the
same account number. In that case, the patch will be strong.
</p>
<p>
If, however, many accounts can have the same number, applying
that patch to another knowledge base may inadvertently alter
the wrong account. The patch would be weak.
</p>
<p>
In normal information processing, of course, numbers such as
bank account numbers are used to avoid this confusion.
Consider those graphs in which every blank node is in fact
unambiguously identified by one functional or inverse
functional property. Further, that property is invariant
under any changes represented by the deltas.
</p>
<p>
The pattern for terms goes as follows:
</p>
<p class="definition">
Given a background ontology <samp>W</samp> and a graph
<samp>G</samp>, if a blank node <samp>b</samp> in
<samp>G</samp> is the object of a triple whose subject
<samp>v</samp> is <em>functionally ground</em> and whose
predicate <samp>p</samp> is an
<code>owl:FunctionalProperty</code> according to
<samp>W</samp>, then <samp>v.p</samp> is a <em>functional
term label</em> for <samp>b</samp> in <samp>G</samp> with
respect to <samp>W</samp>. Likewise, <samp>v\uparrow q</samp>
is a functional term label for <samp>b</samp> if
<samp>q</samp> is an
<code>owl:InverseFunctionalProperty</code>, b is the subject,
and v is the object. Recursively, v is functionally ground if
it is a name (URI or literal) or a bnode with a functional
term label.
</p>
<p>
Then we can rewrite certain graphs:
</p>
<p class="definition">
With respect to a background ontology <samp>W</samp>, a graph
<samp>G</samp> is <em>fully labeled</em> iff every node in
<samp>G</samp> is functionally ground. A <em>functional RDF
graph</em> is a set of triples whose terms are URIs,
literals, or functional terms. A functional RDF graph
<samp>F</samp> is a <em>functional analog</em> of an RDF
graph <samp>G</samp> iff <samp>G</samp> is fully labeled and
<samp>F</samp> can be obtained from <samp>G</samp> by
replacing each bnode b in <samp>G</samp> with a functional
term label for b.
</p>
<p>
The diffs of functional RDF graphs are just as simple to make
as ground RDF deltas:
</p>
<p class="definition">
Given a background ontology <samp>W</samp>, a <em>strong</em>
delta between fully labeled graphs <samp>G1</samp> and
<samp>G2</samp> is a pair <samp>(insertions,
deletions)</samp> where <samp>insertions</samp> is the set
difference <samp>F2-F1</samp>, deletions is
<samp>F1-F2</samp>, and <samp>F1</samp> and <samp>F2</samp>
are functional analogs of <samp>G1</samp> and <samp>G2</samp>
respectively.
</p>
<p class="online">
(@@need to define sigma for strong deltas?) It is actually
the same as for any delta: horn match and delete or insert.
</p>
<p>
A strong delta is like a context diff that cannot be
mis-applied.
</p>
<p class="proposition">
If <samp>D</samp> is a strong delta between fully labeled
graphs <samp>k1</samp> and <samp>k2</samp>, and
<samp>k3</samp> is a subset of <samp>k1</samp>, then
<samp>\Sigma(k3, D)</samp> is consistent with
<samp>k2</samp>. <span class="online">@@TODO: proof</span>
</p>
<p>
One advantage of a strong patch is, then, that one can take a
patch from any true knowledge base change and apply it to a
subset knowledge base, and the result will be true. For
example, if changes to a knowledge base are represented by a
sequence of strong diffs, one can subscribe to the diffs from
any given point on, and acquire a subset of the final
knowledge base.
</p>
<p>
As a practical matter, achieving fully labeled graphs
requires care in building and using the ontology. As a
supplement to the good practice of using URIs to distributing
data, it is useful to identify things indirectly by using
terms with published ontologies that say whether they are
many-many, many-1, 1-many or 1-1. The <a href=
"/2000/10/swap/diff.py">diff.py</a> program from <a href=
"/2000/10/swap/Overview.html">SWAP</a> will generate a strong
diff between two files, provided it can find sufficient
information in the Web to fully label the input graphs.
</p>
<p class="online">
We note in passing that the ontologies we used all involved
inverse functional datatype properties, which are OWL/Full
but not OWL/DL.
</p>
<h2>
Application to Update and Sync
</h2>
<p>
Though we have made small scale tests, we are interested in
pursuing strong diffs, and suspect they will be are useful in
a variety of applications.
</p>
<h3>
Peer-peer update and sync
</h3>
<p>
The algorithm for synchronizing two databases can be
straightforwardly generalized to N. In a decentralized
peer-peer network such as Network News Transfer
Protocol<a href="#NNTP">[NNTP]</a> (or many others), messages
are timestamped and distributed eventually to every party,
though a message may be received by different parties at
different times. When the network is reliable, there may be a
well-defined maximum delivery time.
</p>
<p>
A crude algorithm is to apply the patches in order of the
time-stamp. If a message arrives with a timestamp preceding
the recent ones already taken into account, they are unwound
so that the new version can be built in the proper order. A
patch which fails (as in a CVS conflict) is rejected. In the
case of RDF graphs, failure can be a pre-agreed form of
consistency, such as (for example) OWL-DL consistency. The
sender of the failed patch will realize this as they will be
running the same algorithm on the same patches, and will have
to take recovery action.
</p>
<p>
A new version can be given a version id by hashing the
version id of its predecessor with the message id of the
patch used to make the new from the old. The community can
refer to versions by these ids, and if they want to refer to
a commonly held document, then one only has to wait for the
maximum delivery time to know that everyone in the community
will know the value of the knowledge base for that version.
Even without waiting, anyone who knows of a version with that
ID will know they have the same contents.
</p>
<h3>
Patches as knowledge
</h3>
<p>
The idea of the strong patch file format is interesting
because a patch is a little bit of knowledge. A patch for
example that where my phone number was 1234 it should now be
5678, when in the context in which it is known to be a change
to a valid knowledge base between one week and the next,
indicates that my phone number has actually changed. One
might conclude, say, that I moved or changed jobs. A strong
patch has meaning in itself, and distributing and filtering
these becomes an interesting way of processing knowledge. In
some areas (like houses for sale) it is the new changed
information which is of most interest, and in some areas
(like currency rates) if you listen to a stream of changes
you will in fact accumulate a working knowledge of the area.
</p>
<h3>
Patches as news
</h3>
<p>
From the historical <em>NCSA Mosaic What's New</em> page to
the current syndication of RSS streams <a href=
"#RSS">[RSS]</a>, the interest in news on (or off) the Web
demonstrates that there is great interest in changes to the
status quo. We speculate that this will also be the case on
the Semantic Web. When the state is represented in RDF, then
RDF diffs represent news. The W3C Technical Reports list is
available as RDF, and the W3C RSS feed is partly,
effectively, a list of changes to the Tech Reports list. This
could be formalized by explicitly distributing RDF diffs.
</p>
<h2>
Future directions
</h2>
<p>
The algorithm developed to date produces difference files
only on graphs which are labeled directly with URIs or
indirectly with functional properties or inverse functional
properties.
</p>
<p>
It may be useful to extend the algorithm to cope with graphs
which are not completely labeled, but where the unlabeled
bits are the same in each graph, and so a strong diff can
still be produced. Another avenue would be to look at using
more than one property to label a node when one is not
sufficient.
</p>
<p>
Applications which do not need robustness can use weak
patches. The algorithm could be extended to do more of a
canonicalization-style signature-based match to optionally
give a weak diff where a strong diff cannot be given.
</p>
<p class="online">
In practice, while RDF fundamentally has a graph structure,
the graph is often used to encode ordered lists (RDF
collections). While lists are in fact represented by a
structure of <em>first</em> and <em>rest</em> links within
the graph, when serialized they are normally represented
directly as lists, and within software implementations they
may be stored specially. The representation of changes to
lists may merit a special syntax in the difference file, to
avoid a mess of <em>rdf:first</em> and <em>rest:rest</em>
statements. (@@DanC: first/rest are functional, so I don't
think this case mertis anything special.)
</p>
<p>
RDF does not contain the notion of an unordered set, though
one can with OWL create a class which has an enumerated set
of members. If the use of unordered sets becomes common,
which the authors suspect would be wise in the long run, then
a difference engine should be aware of such sets and be able
to express differences between them.
</p>
<p>
This application, like the rule language, demonstrates the
usefulness of the quoted formulae of n3. The authors believe
that many applications will need this ability to quote RDF
graphs within graphs. As n3 becomes a language of
communication, difference files will of course have to
express changes to nested formulae. As these are graphs, this
is basically a straightforward recursive use of the
difference system for single graphs. A simple though verbose
alternative is to reify the n3 before building differences.
</p>
<p>
With these extensions, the simple difference file format may
lose the elegance of its current simplicity. However, even
with these extensions, most data and ontologies shipped
around the web -- the bottom layers of the semantic web layer
cake -- will be plain RDF graphs and so have simple
difference files.
</p>
<p>
Clearly there are many algorithms which can be imagined for
efficiently generating deltas for RDF graphs. The ones
written are not particularly efficient, having being designed
as proof of concept.
</p>
<h2>
Conclusions
</h2>
<p>
There are many uses for technology of communicating
differences between graphs or changes to a graph. While in
general the generation of differences is basically a graph
isomorphism problem, in a wide set of practical cases, one
can efficiently generate a difference, or patch file.
So-called strong patch files are particularly interesting,
and open up a new series of applications based on the
syndication of change information. However, to be able to
generate them, one needs either a well-labeled graph, which
in turn needs an ontological knowledge of inverse functional
properties to allow nodes to be indirectly labeled. The patch
file format proposed is simple, being a new ontology of only
two (or three) new properties, and directly uses Notation3
syntax and semantics, which itself is a simple extension of
RDF. This format can be generated by all sorts of
difference-finding algorithms. It can be absorbed by any
system capable of matching RDF subgraphs. The patch file
ontology is a candidate for a future standard for remote
update of RDF data.
</p>
<div>
<h2>
References
</h2>
<p class="online">
see <a href="lncs04/Diffbib.bib">Diffbib.bib</a>
</p>
<dl class="bib">
<dt class="misc">
[<a name="RDF04" id="RDF04">RDF04</a>]
</dt>
<dd>
<span class="author">Beckett, D.</span> <cite><a href=
"http://www.w3.org/TR/2004/REC-rdf-syntax-grammar-20040210/">
RDF/XML Syntax Specification (Revised)</a></cite>
<span class="institution">W3C</span> <span class=
"type">Recommendation</span>, 10 <span class=
"month">February</span> <span class="year">2004</span>.
<p class="online">
<a href=
"http://www.w3.org/TR/rdf-syntax-grammar">Latest
version</a> available at
<code>http://www.w3.org/TR/rdf-syntax-grammar</code>
</p>
</dd>
<dt class="misc">
[<a name="DC02" id="DC02">DC02</a>]
</dt>
<dd>
<span class="author">Beckett, D. and Miller, E. and
Brickley, D.</span> <a href=
"http://dublincore.org/documents/2002/07/31/dcmes-xml/"><cite>
Expressing Simple Dublin Core in RDF/XML</cite></a>
<span class="institution">Dublin Core Metadata
Initiative</span> <span class=
"type">Recommendation</span> 31 <span class=
"month">July</span> <span class="year">2002</span>
</dd>
<dt class="misc">
[<a name="RSS" id="RSS">RSS</a>]
</dt>
<dd>
<span class="author">Beged-Dov, Gabe et. al.</span>
<cite><a href="http://web.resource.org/rss/1.0/">RDF Site
Summary (RSS) 1.0</a></cite> 6 <span class=
"month">December</span> <span class="year">2000</span>
</dd>
<dt class="inproceedings">
[<a name="Ber90" id="Ber90">Ber90</a>]
</dt>
<dd>
<span class="author">Berliner, Brian</span> <cite>CVS II:
Parallelizing Software Development</cite> <span class=
"booktitle"><a href="http://www.usenix.org/">USENIX</a>
Conference Proceedings</span> pp <span class=
"pages">341--352</span> <span class=
"month">January</span> 22-26, <span class=
"year">1990</span> <span class="address">Washington,
D.C.</span>
<p class="online">
<a href=
"http://www.hpcc.ecs.soton.ac.uk/hpci/tools/cvs/html/cvs-paper.html">
online copy</a>; <a href=
"http://cvsweb.xfree86.org/cvsweb/cvs/doc/cvs-paper.ms">
ms source</a>
</p>
</dd>
<dt class="misc">
[<a name="Ber03" id="Ber03">Ber03</a>]
</dt>
<dd>
<span class="author">Berners-Lee, Tim and Hawke, Sandro
and Connolly, Dan</span> <cite><a href=
"http://www.w3.org/2000/10/swap/doc/">Semantic Web
Tutorial Using N3</a></cite> <span class=
"howpublished">Twelfth International World Wide Web
Conference</span> <span class="address">Budapest,
Hungary</span> <span class="month">May</span>
<span class="year">2003</span>
</dd>
<dt class="techreport">
[<a name="Car03" id="Car03">Car03</a>]
</dt>
<dd>
<span class="author">Carroll, Jeremy J. and Dickinson,
Ian and Dollin, Chris and Reynolds, Dave and Seaborne,
Andy and Wilkinson, Kevin</span> <cite><a href=
"http://www.hpl.hp.com/techreports/2003/HPL-2003-146.html">
Jena: Implementing the Semantic Web
Recommendations</a></cite> <span class=
"institution">Hewlett-Packard</span> <span class=
"number">HPL-2003-146</span> <span class=
"month">Dec</span> <span class="year">2003</span>
<p class="online">
<a href=
"http://www.hpl.hp.com/semweb/jena.htm">Jena</a>
includes a graph diff program <code>rdfcompare</code>
in the <a href=
"http://jena.sourceforge.net/tools.html">command line
tools</a>.
</p>
</dd>
<dt class="TechReport">
[<a name="Car03s" id="Car03s">Car03s</a>]
</dt>
<dd>
<span class="author">Caroll, Jeremy J.</span>
<cite><a href=
"http://www.hpl.hp.com/techreports/2003/HPL-2003-142.html">
Signing RDF Graphs</a></cite> <span class=
"institution">Hewlett-Packard</span> <span class=
"number">HPL-2003-142</span> <span class=
"month">Jul</span> <span class="year">2003</span>
</dd>
<dt class="Article">
[<a name="Codd70" id="Codd70">Codd70</a>]
</dt>
<dd>
<span class="author">Codd, E. F.</span> <cite><a href=
"http://www.acm.org/classics/nov95/">A Relational Model
of Data for Large Shared Data Banks</a></cite>,
<span class="journal">Communications of the ACM</span>,
Vol. <span class="volume">13</span>, No. <span class=
"number">6</span>, <span class="month">June</span>
<span class="year">1970</span>, pp. <span class=
"pages">377--387</span>.
</dd>
<dt class="Article">
[<a name="Gol03" id="Gol03">Gol03</a>]
</dt>
<dd>
<span class="author">Golbeck, Jennifer and Fragoso,
Gilberto and Hartel, Frank and Hendler, James and Parsia,
Bijan and Oberthaler, Jim</span> <cite><a href=
"http://www.mindswap.org/papers/WebSemantics-NCI.pdf">The
national cancer institute's thesaurus and
ontology</a></cite>. <span class="journal">Journal of Web
Semantics</span>, <span class=
"volume">1</span>(<span class="number">1</span>),
<span class="month">Dec</span> <span class=
"year">2003</span>.
</dd>
<dt class="misc">
[<a name="RDFT04" id="RDFT04">RDFT04</a>]
</dt>
<dd>
<span class="author">Grant, J. and Beckett, D.</span>
<cite><a href=
"http://www.w3.org/TR/2004/REC-rdf-testcases-20040210/">RDF
Test Cases</a></cite>, <span class=
"institution">W3C</span> <span class=
"type">Recommendation</span>, 10 <span class=
"month">February</span> <span class="year">2004</span>.
<p class="online">
<a href="http://www.w3.org/TR/rdf-testcases">Latest
version</a> available at
<tt>http://www.w3.org/TR/rdf-testcases</tt>
</p>
</dd>
<dt class="misc">
[<a name="RDFC04" id="RDFC04">RDFC04</a>]
</dt>
<dd>
<span class="author">Klyne, G. and Carroll, J. J.</span>
<cite><a href=
"http://www.w3.org/TR/2004/REC-rdf-concepts-20040210/">Resource
Description Framework (RDF): Concepts and Abstract
Syntax</a></cite>, <span class="institution">W3C</span>
<span class="type">Recommendation</span>, 10 <span class=
"month">February</span> <span class="year">2004</span>.
<p class="online">
<a href="http://www.w3.org/TR/rdf-concepts/">Latest
version</a> available at
<code>http://www.w3.org/TR/rdf-concepts/</code>
</p>
</dd>
<dt class="TechReport">
[<a name="NNTP" id="NNTP">NNTP</a>]
</dt>
<dd>
<span class="author">Kantor, Brian and Lapsley,
Phil</span> <cite><a href=
"http://www.ietf.org/rfc/rfc977">Network News Transfer
Protocol</a></cite> <span class="institution">IETF</span>
<span class="number">RFC977</span> <span class=
"month">February</span> <span class="year">1986</span>
</dd>
<dt class="misc">
[<a name="Kly04" id="Kly04">Kly04</a>]
</dt>
<dd>
<span class="author">Klyne, Graham</span> <cite><a href=
"http://www.ninebynine.org/RDFNotes/Swish/Intro.html">Semantic
Web Inference Scripting in Haskell</a></cite>
<span class="month">Feb</span> <span class=
"year">2004</span>
<p class="online">
see esp. section <a href=
"http://www.ninebynine.org/RDFNotes/Swish/Intro.html#GraphDiff">
Comparing graphs</a>
</p>
</dd>
<dt class="Book">
[<a name="Kob93" id="Kob93">Kob93</a>]
</dt>
<dd>
<span class="author">Johannes K<span title=
"\"o">ö</span>bler and Uwe Sch<span title=
"\"o">ö</span>ning and Jacobo Tor<span title=
"\'a">á</span>n</span> <cite><a href=
"http://www.birkhauser.com/cgi-win/ISBN/0-8176-3680-3">The
Graph Isomorphism Problem: Its Structural
Complexity</a></cite> <span class="series">Progress in
Theoretical Computer Science</span>. <span class=
"publisher">Birkh<span title=
"\"a">ä</span>user</span>, <span class=
"address">Boston, MA</span>, (<span class=
"year">1993</span>).
<p class="online">
<a href=
"http://www.informatik.hu-berlin.de/Institut/struktur/algorithmenII/Buecher/GI/">
preface, TOC, etc.</a>. cited in <a href=
"http://www.math.tu-berlin.de/~schwartz/papers/KaibelSchwartz2002.references.bib">
KaibelSchwartz2002.references.bib</a>
</p>
</dd>
<dt class="Article">
[<a name="Mill85" id="Mill85">Mill85</a>]
</dt>
<dd>
<span class="author">Miller, Webb and Myers, Eugene
W.</span> <cite>A File Comparison Program</cite>
<span class="journal">Software---Practice and
Experience</span>, <span class=
"volume">15</span>(<span class="number">11</span>), pp.
<span class="pages">1025--1040</span>, <span class=
"month">November</span> <span class="year">1985</span>.
<p class="online">
<a href=
"http://liinwww.ira.uka.de/cgi-bin/bibshow?e=TF0tqf/fyqboefe%7d789658&r=bibtex&mode=intra">
bib</a>
</p>
</dd>
<dt class="Book">
[<a name="Ski97" id="Ski97">Ski97</a>]
</dt>
<dd>
<span class="author">Skiena, Steve</span> <cite>The
Algorithm Design Manual</cite> <span class=
"publisher"><a href="http://www.telospub.com/">Telos
Pr</a></span> <span class="address">New York</span>
<span class="year">1997</span>
</dd>
<dt class="incollection">
[<a name="Ski01" id="Ski01">Ski01</a>]
</dt>
<dd>
<span class="author"><a href=
"http://www.cs.sunysb.edu/~skiena/">Skiena,
Steve</a></span> <span class="chapter">1.5.9</span>
<cite><a href=
"http://www.cs.sunysb.edu/~algorith/files/graph-isomorphism.shtml">
Graph Isomorphism</a></cite> in the <span class=
"booktitle"><a href=
"http://www.cs.sunysb.edu/~algorith/index.html">Stony
Brook Algorithm Repository</a></span> <span class=
"publisher">Stony Brook University</span> <span class=
"year">2001</span>
<p class="online">
with reference to <a href=
"http://www.cs.sunysb.edu/~algorith/implement/gmt/implement.shtml">
GMT - Graph Matching Toolkit</a>
</p>
</dd>
<dt class="Article">
[<a name="Tich85" id="Tich85">Tich85</a>]
</dt>
<dd>
<span class="author">Tichy, W.</span> <a href=
"http://portal.acm.org/citation.cfm?id=4202&dl=ACM&coll=GUIDE">
<cite>RCS--a system for version control</cite></a>
<span class="journal">Software Practice <span class=
"amp">&</span> Experience</span> Volume <span class=
"volume">15</span> , Issue <span class="number">7</span>
(<span class="month">July</span> <span class=
"year">1985</span>) Pages: <span class=
"pages">637--654</span>
</dd>
<dt class="misc">
[<a name="Sync02" id="Sync02">Sync02</a>]
</dt>
<dd>
<cite><a href=
"http://www.openmobilealliance.org/tech/affiliates/syncml/syncmlindex.html">
SyncML Specifications, Version 1.1</a></cite>
<span class="month">Feb</span> <span class=
"year">2002</span> <span class="publisher"><a href=
"http://www.openmobilealliance.org/">Open Mobile Alliance
(OMA)</a></span>
</dd>
<dt class="misc">
[<a name="Wall" id="Wall">Wall</a>]
</dt>
<dd>
<span class="author">Wall, Larry et. al.</span>
<cite><a href=
"http://www.gnu.org/software/patch/patch.html">patch</a></cite>
<span class="publisher">Free Software Foundation</span>
27 <span class="month">Jun</span> <span class=
"year">2000</span>
</dd>
<dt class="misc">
[<a name="EARL" id="EARL">EARL</a>]
</dt>
<dd>
<span class="author">Chisholm, W. and Palmer, S.
B.</span> Editors: <cite><a href=
"http://www.w3.org/TR/2002/WD-EARL10-20021206/">Evaluation
and Report Language (EARL) 1.0</a></cite> <span class=
"institution">W3C</span> <span class="type">Working
Draft</span> (work in progress), 6 <span class=
"month">December</span> <span class="year">2002</span>
<p class="online">
<a href="http://www.w3.org/TR/EARL10/">Latest
version</a> available at http://www.w3.org/TR/EARL10/
</p>
</dd>
<dt class="misc">
[<a name="OWLT" id="OWLT">OWLT</a>]
</dt>
<dd>
<span class="author">Carroll, J. J. and De Roo, J.</span>
Editors: <cite><a href=
"http://www.w3.org/TR/2004/REC-owl-test-20040210/">OWL
Web Ontology Language Test Cases</a></cite> <span class=
"institution">W3C</span> <span class=
"type">Recommendation</span> , 10 <span class=
"month">February</span> <span class="year">2004</span>.
<p class="online">
<a href="http://www.w3.org/TR/owl-test/">Latest
version</a> available at http://www.w3.org/TR/owl-test/
</p>
</dd>
<dt class="misc">
[<a name="OWL" id="OWL">OWL</a>]
</dt>
<dd>
<span class="author">Schreiber, G. and Dean, M.</span>
Editors: <cite><a href=
"http://www.w3.org/TR/2004/REC-owl-ref-20040210/">OWL Web
Ontology Language Reference</a></cite> <span class=
"institution">W3C</span> <span class=
"type">Recommendation</span> , 10 <span class=
"month">February</span> <span class="year">2004</span>.
<p class="online">
<a href="http://www.w3.org/TR/owl-ref/">Latest
version</a> available at http://www.w3.org/TR/owl-ref/
</p>
</dd>
</dl>
</div>
<hr />
<div class="online">
<a href="Overview.html">Up to Design Issues</a>
</div>
</body>
</html>