index.html
151 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html lang="en"><head><META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>XML Binary Characterization Properties</title><style type="text/css">
code { font-family: monospace; }
div.constraint,
div.issue,
div.note,
div.notice { margin-left: 2em; }
ol.enumar { list-style-type: decimal; }
ol.enumla { list-style-type: lower-alpha; }
ol.enumlr { list-style-type: lower-roman; }
ol.enumua { list-style-type: upper-alpha; }
ol.enumur { list-style-type: upper-roman; }
div.exampleInner pre { margin-left: 1em;
margin-top: 0em; margin-bottom: 0em}
div.exampleOuter {border: 4px double gray;
margin: 0em; padding: 0em}
div.exampleInner { background-color: #d5dee3;
border-top-width: 4px;
border-top-style: double;
border-top-color: #d3d3d3;
border-bottom-width: 4px;
border-bottom-style: double;
border-bottom-color: #d3d3d3;
padding: 4px; margin: 0em }
div.exampleWrapper { margin: 4px }
div.exampleHeader { font-weight: bold;
margin: 4px}
</style><link type="text/css" rel="stylesheet" href="http://www.w3.org/StyleSheets/TR/W3C-WG-NOTE.css"></head><body><div class="head"><p><a href="http://www.w3.org/"><img width="72" height="48" alt="W3C" src="http://www.w3.org/Icons/w3c_home"></a></p>
<h1><a id="title" name="title"></a>XML Binary Characterization Properties</h1>
<h2><a id="w3c-doctype" name="w3c-doctype"></a>W3C Working Group Note
31 March 2005</h2><dl><dt>This version:</dt><dd>
<a href="http://www.w3.org/TR/2005/NOTE-xbc-properties-20050331/">http://www.w3.org/TR/2005/NOTE-xbc-properties-20050331/</a>
</dd><dt>Latest version:</dt><dd>
<a href="http://www.w3.org/TR/xbc-properties">http://www.w3.org/TR/xbc-properties</a>
</dd><dt>Previous version:</dt><dd>
<a href="http://www.w3.org/TR/2005/WD-xbc-properties-20050224">http://www.w3.org/TR/2005/WD-xbc-properties-20050224</a>
</dd><dt>Editors:</dt><dd>Mike Cokus, MITRE Corporation</dd><dd>Santiago Pericas-Geertsen, Sun Microsystems</dd></dl><p class="copyright"><a href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 2005 <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a><sup>®</sup> (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, <a href="http://www.keio.ac.jp/">Keio</a>), All Rights Reserved. W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>, <a href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> rules apply.</p></div><hr><div>
<h2><a id="abstract" name="abstract"></a>Abstract</h2><p>This document is the result of a study to identify desirable properties in an XML
format. An XML format is a format that is capable of representing the information in
an XML document. The properties have been derived from requirements induced by use
cases collected in the <a href="#">[XBC Use Cases]</a> document. Properties are
divided into two categories: algorithmic and format. Besides these two categories,
Section <a href="#additional-considerations"><b>6 Additional Considerations</b></a> lists additional considerations
which, because of the difficulty of establishing an accurate measurement, have not been
listed as properties but are nonetheless relevant for an accurate comparison between
different proposals. </p></div><div>
<h2><a id="status" name="status"></a>Status of this Document</h2><p><em>This section describes the status of this document at
the time of its publication. Other documents may supersede this
document. A list of current W3C publications and the latest
revision of this technical report can be found in the <a href="http://www.w3.org/TR/">W3C technical reports index</a>
at http://www.w3.org/TR/.</em></p><p>This is a <a href="http://www.w3.org/2004/02/Process-20040205/tr.html#WGNote">Working Group Note</a>, produced by the <a href="http://www.w3.org/XML/Binary/">XML Binary Characterization Working Group</a> as part of the <a href="http://www.w3.org/XML/">XML Activity</a>.</p><p>This document is part of a set of documents
produced according to the Working Group's <a href="http://www.w3.org/2003/09/xmlap/xml-binary-wg-charter.html">charter</a>, in which the Working Group has been determining Use Cases, characterizing the Properties that are
required by those Use Cases, and establishing objective, shared Measurements
to help judge whether XML 1.x and alternate binary encodings provide the
required properties.</p><p>
The XML Binary Characterization Working Group has ended its work.
This document is not expected to become a Recommendation later. It will be
maintained as a WG Note.
</p><p>
Discussion of this document takes place on the public
<a href="mailto:public-xml-binary@w3.org">public-xml-binary@w3.org</a> mailing list (<a href="http://lists.w3.org/Archives/Public/public-xml-binary/">public archives</a>).
</p><p>
Publication as a Working Group Note does not imply endorsement by the W3C
Membership. This is a draft document and may be updated, replaced or
obsoleted by other documents at any time. It is inappropriate to cite
this document as other than work in progress.
</p></div><div class="toc">
<h2><a id="contents" name="contents"></a>Table of Contents</h2><p class="toc">1 <a href="#intro">Introduction</a><br>
2 <a href="#xml-design-goals">Design Goals for XML</a><br>
3 <a href="#syntax-vs-model">Syntax vs. Model</a><br>
4 <a href="#algorithmic-properties">Algorithmic Properties</a><br>
4.1 <a href="#processing-efficiency">Processing Efficiency</a><br>
4.1.1 <a href="#N100E8">Definition</a><br>
4.1.2 <a href="#N100ED">Description</a><br>
4.2 <a href="#small-footprint">Small Footprint</a><br>
4.2.1 <a href="#N10104">Definition</a><br>
4.2.2 <a href="#N10109">Description</a><br>
4.3 <a href="#space-efficiency">Space Efficiency</a><br>
4.3.1 <a href="#N10114">Definition</a><br>
4.3.2 <a href="#N10119">Description</a><br>
5 <a href="#format-properties">Format Properties</a><br>
5.1 <a href="#accelerated-sequential-access">Accelerated Sequential Access</a><br>
5.1.1 <a href="#N1012C">Definition</a><br>
5.1.2 <a href="#N10131">Description</a><br>
5.2 <a href="#compactness">Compactness</a><br>
5.2.1 <a href="#N10141">Definition</a><br>
5.2.2 <a href="#N10146">Description</a><br>
5.3 <a href="#content-type-management">Content Type Management</a><br>
5.3.1 <a href="#N10160">Definition</a><br>
5.3.2 <a href="#N10165">Description</a><br>
5.4 <a href="#deltas">Deltas</a><br>
5.4.1 <a href="#N1018C">Definition</a><br>
5.4.2 <a href="#N10191">Description</a><br>
5.5 <a href="#directly-readable-writable">Directly Readable and Writable</a><br>
5.5.1 <a href="#N101B2">Definition</a><br>
5.5.2 <a href="#N101B7">Description</a><br>
5.6 <a href="#efficient-update">Efficient Update</a><br>
5.6.1 <a href="#N101C4">Definition</a><br>
5.6.2 <a href="#N101C9">Description</a><br>
5.7 <a href="#embedding-support">Embedding Support</a><br>
5.7.1 <a href="#N101EB">Definition</a><br>
5.7.2 <a href="#N101F3">Description</a><br>
5.8 <a href="#encryptable">Encryptable</a><br>
5.8.1 <a href="#N10217">Definition</a><br>
5.8.2 <a href="#N1021C">Description</a><br>
5.8.2.1 <a href="#N10221">Partial Encryption</a><br>
5.8.2.2 <a href="#N1022C">Encryption Interoperability</a><br>
5.9 <a href="#explicit-typing">Explicit Typing</a><br>
5.9.1 <a href="#N10237">Definition</a><br>
5.9.2 <a href="#N1023C">Description</a><br>
5.10 <a href="#extension-points">Extension Points</a><br>
5.10.1 <a href="#N1024D">Definition</a><br>
5.10.2 <a href="#N10252">Description</a><br>
5.11 <a href="#format-version-identification">Format Version Identification</a><br>
5.11.1 <a href="#N1025B">Definition</a><br>
5.11.2 <a href="#N10260">Description</a><br>
5.12 <a href="#fragmentable">Fragmentable</a><br>
5.12.1 <a href="#N10269">Definition</a><br>
5.12.2 <a href="#N1026E">Description</a><br>
5.13 <a href="#generality">Generality</a><br>
5.13.1 <a href="#N10284">Definition</a><br>
5.13.2 <a href="#N10289">Description</a><br>
5.14 <a href="#human-language-neutral">Human Language Neutral</a><br>
5.14.1 <a href="#N10294">Definition</a><br>
5.14.2 <a href="#N10299">Description</a><br>
5.15 <a href="#human-readable-editable">Human Readable and Editable</a><br>
5.15.1 <a href="#N102A2">Definition</a><br>
5.15.2 <a href="#N102A7">Description</a><br>
5.16 <a href="#integratable-into-xml-stack">Integratable into XML Stack</a><br>
5.16.1 <a href="#N102D1">Definition</a><br>
5.16.2 <a href="#N102D9">Description</a><br>
5.17 <a href="#localized-changes">Localized Changes</a><br>
5.17.1 <a href="#N102E7">Definition</a><br>
5.17.2 <a href="#N102EF">Description</a><br>
5.18 <a href="#no-arbitrary-limits">No Arbitrary Limits</a><br>
5.18.1 <a href="#N102FF">Definition</a><br>
5.18.2 <a href="#N10304">Description</a><br>
5.19 <a href="#platform-neutrality">Platform Neutrality</a><br>
5.19.1 <a href="#N1030D">Definition</a><br>
5.19.2 <a href="#N10312">Description</a><br>
5.20 <a href="#random-access">Random Access</a><br>
5.20.1 <a href="#N1031B">Definition</a><br>
5.20.2 <a href="#N10320">Description</a><br>
5.21 <a href="#robustness">Robustness</a><br>
5.21.1 <a href="#N1034D">Definition</a><br>
5.21.2 <a href="#N10352">Description</a><br>
5.22 <a href="#roundtrip-support">Roundtrip Support</a><br>
5.22.1 <a href="#N10365">Definition</a><br>
5.22.2 <a href="#N1036C">Description</a><br>
5.23 <a href="#schema-extensions-deviations">Schema Extensions and Deviations</a><br>
5.23.1 <a href="#N10380">Definition</a><br>
5.23.2 <a href="#N10388">Description</a><br>
5.24 <a href="#schema-instance-change-resilience">Schema Instance Change Resilience</a><br>
5.24.1 <a href="#N1039A">Definition</a><br>
5.24.2 <a href="#N1039F">Description</a><br>
5.25 <a href="#self-contained">Self Contained</a><br>
5.25.1 <a href="#N103AF">Definition</a><br>
5.25.2 <a href="#N103B4">Description</a><br>
5.26 <a href="#signable">Signable</a><br>
5.26.1 <a href="#N103FC">Definition</a><br>
5.26.2 <a href="#N10401">Description</a><br>
5.26.2.1 <a href="#N10406">Byte Sequence Preservation</a><br>
5.26.2.2 <a href="#N1040F">Partial Signatures</a><br>
5.26.2.3 <a href="#N1041A">Signature Interoperability</a><br>
5.27 <a href="#specialized-codecs">Specialized codecs</a><br>
5.27.1 <a href="#N10425">Definition</a><br>
5.27.2 <a href="#N1042A">Description</a><br>
5.28 <a href="#streamable">Streamable</a><br>
5.28.1 <a href="#N10439">Definition</a><br>
5.28.2 <a href="#N1043E">Description</a><br>
5.29 <a href="#support-for-error-correction">Support for Error Correction</a><br>
5.29.1 <a href="#N10479">Definition</a><br>
5.29.2 <a href="#N10480">Description</a><br>
5.30 <a href="#transport-independence">Transport Independence</a><br>
5.30.1 <a href="#N10492">Definition</a><br>
5.30.2 <a href="#N10497">Description</a><br>
6 <a href="#additional-considerations">Additional Considerations</a><br>
6.1 <a href="#forward-compatibility">Forward Compatibility</a><br>
6.2 <a href="#implementation-cost">Implementation Cost</a><br>
6.3 <a href="#royalty-free">Royalty Free</a><br>
6.4 <a href="#single-conformance-class">Single Conformance Class</a><br>
6.5 <a href="#widespread-adoption">Widespread Adoption</a><br>
7 <a href="#N104E9">References</a><br>
</p>
<h3><a id="appendices" name="appendices"></a>Appendix</h3><p class="toc">A <a href="#N1058E">Acknowledgments</a><br>
</p></div><hr><div class="body"><div class="div1">
<h2><a id="intro" name="intro"></a>1 Introduction</h2><p>While XML has been enormously successful as a markup language for documents and data,
the overhead associated with generating, parsing, transmitting, storing, or
accessing XML-based data has hindered its employment in some environments. Use cases
describing situations where some characteristics of XML prevent its effective use
are described in another publication of the XBC WG <a href="#">[XBC Use Cases]</a>. </p><p>The question has been raised as to whether some optimization of XML is appropriate to
satisfy the constraints presented by those use cases. In order to address this
question, a compatible means of classifying the requirements posed by the use cases
and the applicable characteristics of XML must be devised. This allows a
characterization of the potential gap between what XML supports and use case
requirements. In addition, it also provides a way to compare use case requirements
to determine the degree to which a common approach to XML optimization could be
beneficial.</p><p>For the purpose of this document, a <em>property</em> is defined as a unique
characteristic of an XML format which affects the format's utility for some
collection of use cases. A consequence of this definition is that a property shall
only be regarded as positive or negative in the context of one or more use cases. In
other words, a collection of use cases is necessary to understand how a property
affects the utility of a format. </p></div><div class="div1">
<h2><a id="xml-design-goals" name="xml-design-goals"></a>2 Design Goals for XML</h2><p>The XML 1.0 recommendation <a href="#">[XML 1.0]</a> outlines a number of design goals
or constraints which resulted in the creation of XML as it is known today. The
design goals were:</p><ol class="enumar"><li><p>XML shall be straightforwardly usable over the Internet.</p></li><li><p>XML shall support a wide variety of applications.</p></li><li><p>XML shall be compatible with SGML.</p></li><li><p>It shall be easy to write programs which process XML documents.</p></li><li><p>The number of optional features in XML is to be kept to the absolute minimum,
ideally zero.</p></li><li><p>XML documents should be human-legible and reasonably clear.</p></li><li><p>The XML design should be prepared quickly.</p></li><li><p>The design of XML shall be formal and concise.</p></li><li><p>XML documents shall be easy to create.</p></li><li><p>Terseness in XML markup is of minimal importance.</p></li></ol><p>Compatibility with SGML <a href="#">[ISO 8879]</a> was a key design goal during the
conception of XML. In fact, XML is regarded as a subset (or profile) of SGML, whose
main purpose was to reduce the inherent complexity of SGML. By reducing the
complexity, e.g., the number of options, XML became much simpler to implement than
SGML, and this in turn resulted in the availability of a myriad of tools and APIs.
It is precisely these tools and APIs (as well as the phenomenal growth of the
Internet) that have attracted a number of different communities to XML. </p></div><div class="div1">
<h2><a id="syntax-vs-model" name="syntax-vs-model"></a>3 Syntax vs. Model</h2><p>The XML 1.0 recommendation <a href="#">[XML 1.0]</a> defines the XML language using a BNF grammar. Although a
number of data models have been built on top of XML, as a syntactically defined
language, XML is in itself data model agnostic. As stated earlier, an XML format is
a format which is capable of representing the information in an XML document.
Information, however, is in the eye of the beholder; what constitutes information,
as opposed to just data, depends on the data model on which an XML processor is
based.</p><p>The XML infoset was an attempt to establish a separation between data and information
in a way that would suit most common uses of XML. In fact, many of the existing data
models are defined by referring to XML infoset items. However, the XML infoset does
not establish a sound separation between information and data for all applications
of XML. For example, the XML infoset recommendation does not regard the use of
single or double quotes to delimit an attribute value as information, yet there are
applications like XML editors for which this distinction matters.</p><p>The discussion on which is the right data model or what constitutes data versus
information is unlikely to end anytime soon (if ever). Thus, it was the decision of
the XBC WG to leave it out of this document to avoid premature exclusion of
potential uses of XML formats not captured in any of the existing data models. There
are, however, properties such as <a href="#roundtrip-support"><b>5.22 Roundtrip Support</b></a> which can be
used to tighten the relationship between XML and an alternative XML format, but that
do so without diving into the controversial data model discussion.</p></div><div class="div1">
<h2><a id="algorithmic-properties" name="algorithmic-properties"></a>4 Algorithmic Properties</h2><div class="div2">
<h3><a id="processing-efficiency" name="processing-efficiency"></a>4.1 Processing Efficiency</h3><div class="div3">
<h4><a id="N100E8" name="N100E8"></a>4.1.1 Definition</h4><p>This property refers to the speed at which a new format can be generated
and/or consumed for processing with respect to that of XML.</p></div><div class="div3">
<h4><a id="N100ED" name="N100ED"></a>4.1.2 Description</h4><p>There are three broad areas of processing with regard to an XML format:</p><ol class="enumar"><li><p>Serialization: The generation of the format from a data model.</p></li><li><p>Parsing: The reading of the format in order to process and extract
various pieces of information contained in it.</p></li><li><p>Data Binding: The creation of an application data model from the data
contained in the format.</p></li></ol><p>It is sometimes desirable for an XML format to allow for all areas to be
performed in a more efficient manner than is currently possible with XML.
For example, it should be possible to serialize a message faster than using
XML. Furthermore, parsing the resulting format should be faster than parsing
XML.</p><p>Processing efficiency should be considered in an end-to-end manner, from
application accessible data on one end to application accessible data on the
other end. In other words, it is desirable to have a process that is
efficient not only in parsing, but in generation, transmission and data
binding. However, not all applications need symmetric speed. Some
applications may require efficient parsing without specific needs as to how
long it takes to generate the format. Other applications may have the
opposite concerns.</p></div></div><div class="div2">
<h3><a id="small-footprint" name="small-footprint"></a>4.2 Small Footprint</h3><div class="div3">
<h4><a id="N10104" name="N10104"></a>4.2.1 Definition</h4><p>This property refers to the size of a processor implementing a new format
with respect to that of a processor implementing XML.</p></div><div class="div3">
<h4><a id="N10109" name="N10109"></a>4.2.2 Description</h4><p>Establishing the exact footprint of an implementation of a format is
impractical due to the number of different programming languages and
platforms that are currently available. However, given the specification of
a format it is possible to determine if the format enables the
implementation of processors whose footprints are smaller than the typical
XML processor for a similar application. This can be accomplished by
considering the number and/or complexity of the features that are required
(which impacts the size of the code segment) and the amount of data that
must be available to a processor in order to support the format (which
impacts the size of the initialized data segment). </p><p>Perhaps the best example is that of XML versus SGML. By simply inspecting the
corresponding specifications it is possible to estimate, given the reduced
number of options and features, the footprint of a typical XML processor to
be smaller than that of an SGML processor. (In fact, many experts in the
field view this property of XML 1.x as one of the key reasons for its
success, so it is only natural to consider it when evaluating alternate
formats).</p></div></div><div class="div2">
<h3><a id="space-efficiency" name="space-efficiency"></a>4.3 Space Efficiency</h3><div class="div3">
<h4><a id="N10114" name="N10114"></a>4.3.1 Definition</h4><p>This property refers to the memory requirements of a processor implementing a
new format with respect to that of a processor implementing XML.</p></div><div class="div3">
<h4><a id="N10119" name="N10119"></a>4.3.2 Description</h4><p>A format should be processable in a wide variety of platforms. Small devices
such as mobile handsets, for instance, have a limited amount of memory compared
to that of desktop PCs or servers. The amount of <em>dynamic</em>
memory that a format requires in order to process an instance may hinder its
application in certain platforms. XML is currently supported in devices that
have much less memory than a PC. Thus, it is imperative for an alternate
format to enable the implementation of processors whose memory requirements
are smaller (or at least not higher) than the typical XML processor.</p><p>In many cases, space efficiency is inversely proportional to processing
efficiency. I.e., the desired level of space efficiency is often achieved by
increasing the processing time. Therefore, it is desirable for a format to
enable the implementation of processors whose space efficiency could be
configured based on the available memory. For example, if the format is
processed on a high-end server, a processor should support maximum
processing efficiency at the expense of memory efficiency. On the other
hand, if the format is processed on a low-end mobile handset, a processor
should support maximum space efficiency at the expense of processing
efficiency.</p></div></div></div><div class="div1">
<h2><a id="format-properties" name="format-properties"></a>5 Format Properties</h2><div class="div2">
<h3><a id="accelerated-sequential-access" name="accelerated-sequential-access"></a>5.1 Accelerated Sequential Access</h3><div class="div3">
<h4><a id="N1012C" name="N1012C"></a>5.1.1 Definition</h4><p>Accelerated sequential access is the ability to sequentially stream through
an XML file when searching for data model items more rapidly than the
average seek time using character-by-character comparison. </p></div><div class="div3">
<h4><a id="N10131" name="N10131"></a>5.1.2 Description</h4><p>Accelerated Sequential Access is similar to <a href="#random-access"><b>5.20 Random Access</b></a> in
its overall objective of reducing the amount of time needed to access data
model items, but differs in the method used to accelerate the access. In
random access, lookup is performed in constant time through the use of a
table, while in accelerated sequential access data model items are searched
in streaming mode, resulting in a lookup time that is related (yet not
necessarily proportional) to the number of data model items in the document.</p><p>One approach to supporting this property is through the inclusion in the XML
document of an index which allows skipping over content as the document is
read. For example, an element index might contain the offset to the start
tag of the next peer element. If the application recognizes that a desired
data model item is not in the current element, or in the children of the
current element, it will be able to restart the match at the offset of the
next peer element without inspection of all data model items contained in
the current element. A format that enables faster matching via the
conversion of strings to tokens can also be considered as supporting the
Accelerated Sequential Access property.</p><p>Performance of accelerated sequential access is measured by the time
(algorithmic complexity) required to find data model items, the time needed
to construct any indexes and special structures, and also the size of those
indexes and special structures (impacting memory consumption and bandwidth
utilization in the transport). Most implementations will support
modification of the XML document; the cost of updating the indexes or
special structures is another performance factor which can be measured.</p></div></div><div class="div2">
<h3><a id="compactness" name="compactness"></a>5.2 Compactness</h3><div class="div3">
<h4><a id="N10141" name="N10141"></a>5.2.1 Definition</h4><p>Compactness refers to the size of the in-memory or otherwise stored
representation of an XML format. Compactness is achieved by ensuring that a
format includes as little extraneous information as possible. Extraneous
information is any information that is not needed in order to process the
format completely and properly.</p></div><div class="div3">
<h4><a id="N10146" name="N10146"></a>5.2.2 Description</h4><p>A compact encoding can be achieved in different ways: a number of different
techniques such as lossy/loss-less, schema-based/non-schema-based,
delta-based/non-delta-based, among others, have been considered. For
example, JPEG files use a lossy encoding where bits of the
original document are thrown away (and cannot be recovered) in order to
achieve a compact representation. The same type of lossy encoding could be
employed for XML documents in order to achieve compactness.</p><p>Alternatively, differing degrees of compactness can be achieved with a
loss-less encoding, whereby redundant information is removed. In this manner
no information is lost; instead, compactness is achieved through the removal
of this redundant information. A loss-less encoding would typically be less
compact than a lossy encoding. </p><p>Furthermore, a schema-based encoding of an XML document can achieve a degree
of compactness by using prior knowledge about the structure and content of a
document. A format is schema-based if it uses information from the
document's schema to achieve a better degree of compactness. This
information could be used later as the document is processed or
reconstituted. It is worth pointing out that although not self contained, a
schema-based encoding is not inherently lossy given that, in principle, a
decoder can reproduce the data model using both the encoding and the schema.
Thus, as with other techniques, a schema-based encoding can be lossy or
loss-less.</p><p>Another mechanism to achieve compactness is through a delta-based encoding.
Delta-based encodings are generated by comparing an original document with
a secondary, reference document. The resulting document is the delta between
the original and the reference document. This type of encoding can be lossy
or loss-less. In either case, the original document can be reconstituted by
using both the delta and the reference document.</p><p>The advantages of a compact representation are: </p><ol class="enumar"><li><p>Storage: Large XML documents can be stored in the compact format,
thus saving space.</p></li><li><p>Transmission: Large XML documents can be transmitted more efficiently
when represented in a more compact form, thus saving time. This is
especially important when sending XML over low-bandwidth
connections.</p></li></ol><p>A disadvantage of any compact encoding might be the additional time and CPU
required to generate the encoding.</p></div></div><div class="div2">
<h3><a id="content-type-management" name="content-type-management"></a>5.3 Content Type Management</h3><div class="div3">
<h4><a id="N10160" name="N10160"></a>5.3.1 Definition</h4><p>A format integrates into the media type and encoding infrastructure if it
defines one or more media types and/or encodings for itself as well as the
way in which they should be used.</p></div><div class="div3">
<h4><a id="N10165" name="N10165"></a>5.3.2 Description</h4><p>The media type and encoding infrastructure provides for a common and simple
way of identifying the contents of a document and the content coding with
which it is transmitted. It is fundamental to the functioning of the Web and
enables powerful features such as content negotiation. While required for
the Web, these mechanisms are not specific to it and are typically reused in
many other situations.</p><p>It is therefore desirable that formats meant to be used on the Web define
(and preferably register) the media type and/or encoding that one is to use
when transmitting them. </p><p>There are multiple ways in which an alternate XML format could define how
media types and encodings are to be used with it. Several options of note
and their associated trade-offs are: </p><ul><li><p>The alternate XML serialization is considered to just be a content
coding. In this case it <em>may</em> have a media type (as gzip
does with 'application/gzip' in addition to the 'gzip' content
coding) but the principal way of using it is to keep the original
media type of the XML content and only change the content coding.
The upside of this approach is that the existing content dispatching
system is untouched, that the media type information is fully
useful, and that the content coding infrastructure is put to good
use. The downside is that there is philosophical and technical
dissent as to whether an alternate XML serialization is an encoding
in the way that gzip is —a discussion that needs to
involve considerations concerning the <a href="#roundtrip-support"><b>5.22 Roundtrip Support</b></a>, <a href="#directly-readable-writable"><b>5.5 Directly Readable and Writable</b></a>, and <a href="#integratable-into-xml-stack"><b>5.16 Integratable into XML Stack</b></a> properties. With this
approach content negotiation is fully possible. The behaviour of
fragment identifiers does not need to be re-specified.</p></li><li><p>The alternate XML format is not a mere content coding but requires
the definition of one or more media types. This case subdivides into
two options:</p><ul><li><p>There is only the alternate XML format's media type. Any
content sent using that format must have that media type.
The upside of this approach is that it is simple. The
downside is that you lose all media type information of the
original XML content so that you must then define another
system to provide that information, or define new media
types for all possible content (application/binxhtml,
image/binsvg, etc.). With this approach, content negotiation is
entirely impossible (or rather, totally useless) unless new
media types are defined for all things XML. The behaviour of
fragment identifiers becomes impossible to specify, or has
to be re-specified for all the new media types.</p></li><li><p>A new media type suffix is defined in the manner that it was
done for XML content (e.g., "+bix") to be used for all
content expressed using the alternate XML serialization. The
upside of this approach is that it's simple and that the
diversity of media types is maintained. The downside is that
it requires much more intrusive modifications to systems
that rely on existing media types. With this approach, content
negotiation is possible, but with lesser power. The
behaviour of fragment identifiers has to be re-specified to
map back to the one in +xml types.</p></li></ul></li></ul></div></div><div class="div2">
<h3><a id="deltas" name="deltas"></a>5.4 Deltas</h3><div class="div3">
<h4><a id="N1018C" name="N1018C"></a>5.4.1 Definition</h4><p>A delta is a representation of arbitrary changes from a particular instance
of a base, parent document which, along with that parent document, can be
used to represent the new state of the parent. The parent is identified in a
globally or locally unique manner. A delta is distinct from a fragment or a
computed difference, although the latter could be represented as a
delta.</p></div><div class="div3">
<h4><a id="N10191" name="N10191"></a>5.4.2 Description</h4><p>A common need is to convey changes of a potentially large existing object
with minimal processing and data representation. Overall compression of an
object can help minimize data transmitted, but there are always size and
change combinations where this is of minimal use when replication is needed.
A delta is similar to a fragment in that it contains a subset of an overall
object. The primary difference is that a fragment is a source independent,
contiguous range of data. A delta is an efficient record of one or more
changes to the original document. A delta captures changes to the parent
efficiently, represents the changes efficiently, and is efficiently usable
by a receiver along with the original, parent document. The receiver uses
the combination of the original parent document and one or more deltas to
operate as if the receiver had a single object that was the end result of
all changes. One operating mode that has convenient characteristics is to
append deltas to copies of the parent object, either in memory or as a file.
Some existing document interchange formats make use of appended delta
instances, the Adobe Portable Document Format (PDF) being the most notable
<a href="#pdf-ref">[PDF Reference]</a>.</p><p>An important use for deltas is to factor out redundancy in a way that is
related to schema-based redundancy removal. The concept of a delta has been
used in similar ways in the past, an example of which is the minimization of
later packets in SLIP/PPP protocols by referring to prior packets. The
simple equivalent to a parent and its delta is a complete copy of the before
and after instances. This avoids losing any data, but the knowledge of what
has changed must be created by comparison. A delta instance can be created
by a similar differencing process, but this is a high-complexity operation.
An example of the differencing approach is RFC3229 and the experimentally
registered VCDIFF algorithm.</p><p>A delta may be more efficient if nearby changes are localized in the format.
A delta may be the mechanism used to localize changes. </p><p>Some examples where deltas are required or useful are:</p><ul><li><p>Efficient and repeated replication of large objects and their changes
among distributed nodes.</p></li><li><p>Transaction logging of changes to objects, allowing for rollback or
replay.</p></li><li><p>Efficient representation of messages of all sizes in an application
protocol or other communication by reusing redundancy in structure,
invariants, or common values.</p></li><li><p>Rapid and efficient creation of new instances of an object based on a
template which may be large or otherwise should be shared.</p></li></ul><p>There are at least two major ways that a delta-like facility has been
created. These could be referred to as high-level change operations and
low-level change tracking. Efficiency, granularity, and time complexity of
these methods will vary. XML 1.x does not include explicit support for
deltas. </p></div></div><div class="div2">
<h3><a id="directly-readable-writable" name="directly-readable-writable"></a>5.5 Directly Readable and Writable</h3><div class="div3">
<h4><a id="N101B2" name="N101B2"></a>5.5.1 Definition</h4><p>A format is directly readable and writable if it can be serialized from an
instance of a data model and parsed into an instance of a data model without
first being transformed to an intermediate representation.</p></div><div class="div3">
<h4><a id="N101B7" name="N101B7"></a>5.5.2 Description</h4><p>Formats that are directly readable and writable generally make more efficient
use of available memory and processor resources than those that are not. In
addition, they sometimes have better streaming characteristics.</p><p>The parser for a directly readable format can parse the format into an
instance of the data model in one logical step. Likewise, the serializer for
a directly writable format can serialize an instance of the data model in
one logical step. In contrast, a parser for a format that is not directly
readable must transform the original format into the intermediate format
before it parses the intermediate format into an instance of the data model.
Likewise, the serializer for a format that is not directly writable must
serialize an instance of the data model into the intermediate format before
it transforms the intermediate format to the target format. Unless the order
and organization of items in the intermediate format correspond closely to
order and organization of corresponding items in the target format, the
required transformations will negatively impact streaming. </p><p>An example of a format that is not directly readable and writable is a
gzipped XML stream. To create an instance of a data model from a gzipped XML
stream, the stream must be decompressed to XML format, then parsed into the
data model. Likewise, to create a gzipped XML stream from an instance of a
data model, the data model must be serialized to XML format, then
compressed. The compression and decompression steps require additional
processor and memory resources above and beyond that required to parse and
serialize the XML format. In addition, the two step process limits
streaming. </p></div></div><div class="div2">
<h3><a id="efficient-update" name="efficient-update"></a>5.6 Efficient Update</h3><div class="div3">
<h4><a id="N101C4" name="N101C4"></a>5.6.1 Definition</h4><p>Efficient Update refers to the property of being able to efficiently apply
changes to a part of a format instance. This property is important for
applications that require the modification/insertion/deletion of specific
data model items in a way more efficient than a complete
deserialize/modify/serialize cycle.</p></div><div class="div3">
<h4><a id="N101C9" name="N101C9"></a>5.6.2 Description</h4><p>The least efficient case requires the deserialization of the data, prior to
the application of the changes, followed by the serialization of the data,
once the data has been inserted, modified, or deleted. The most efficient
case would apply changes directly on the serialized data, thus avoiding the
need to cross data representation boundaries. A format could have
characteristics that allow it to be modified efficiently, in place, without
being completely rebuilt and potentially without moving substantial amounts
of data. This is direct support for Efficient Update. XML can be modified
in-place, but data after the modification must be moved. XML + gzip cannot
be modified in place at all. As an existence proof, a serialized DOM where
nodes are allocated in a file in a malloc-like way and each object allocated
uses file-relative pointers, would completely support efficient update.</p><p>If direct update is not possible or efficient, efficient support for the
<a href="#deltas"><b>5.4 Deltas</b></a> property would allow an application to use the
original instance along with one or more deltas to serve as an efficient
update mechanism. While the production of a low-level delta for some formats
is cheap and the use of a stack of low-level deltas can be relatively cheap,
this method requires an ever growing stack of changes and indirection
layers. This can be inefficient in certain scenarios.</p><p>This property is concerned with the ability of a format to be modified
without being rebuilt. The <a href="#deltas"><b>5.4 Deltas</b></a> and <a href="#fragmentable"><b>5.12 Fragmentable</b></a> properties, in contrast, are concerned with how application semantics would be
used to actually represent changes from one instance to the next.</p><p>There are three aspects under which this property should be evaluated:</p><ol class="enumar"><li><p>Efficiency of update: This is the time and complexity required to
apply the changes, starting from the original serialization up until
the updated serialization is produced.</p></li><li><p>Efficiency of retrieval: This is the time required to retrieve a
(possibly) modified value.</p></li><li><p>Compactness: This is the additional space required for the
application of an update or the typical overhead of supporting
different kinds of changes to a format instance. In the existence
proof example, inserting a new element might be efficient because it
might just result in an append to the file while inserting
characters in a large text might cause a new chunk to be allocated
at the end of the file and the old chunk to become an unused block.
While the block could be reused just like with malloc, mitigating
the cost, it is still a potential inefficiency.</p></li></ol></div></div><div class="div2">
<h3><a id="embedding-support" name="embedding-support"></a>5.7 Embedding Support</h3><div class="div3">
<h4><a id="N101EB" name="N101EB"></a>5.7.1 Definition</h4><p>A format supports <em>embedding</em> to the extent to which it provides
for the interchange and management of embedded files of arbitrary
format.</p></div><div class="div3">
<h4><a id="N101F3" name="N101F3"></a>5.7.2 Description</h4><p>A variety of use cases call for the inclusion of files of one type inside
another: images, video, and sound embedded within multimedia documents;
arbitrary files inside Web service messages; large datasets bundled with
metadata. File formats vary in their support for this use.</p><p>Formats designed for narrowly constrained purposes, such as GIF, typically
make no provision for embedding. While it may be possible to encode some
additional data in certain metadata fields within such formats, doing so
violates the spirit of the file format and requires tight agreement between
the sender and receiver for interchange. Such formats effectively offer no
interchange or management support and are not considered to support
embedding.</p><p>Other formats, such as XML and TIFF, permit embedding simply by virtue of
flexibility: they do nothing to prevent file embedding. However, because
these formats have no mechanism for distinguishing an embedded file from
other types of data, tight agreement is still required between the sender
and the receiver for interchange. Such mechanisms are also not easily
manageable.</p><p>XML falls somewhere between these first two cases. It is flexible enough to
allow the embedding of files, but only if those files consist entirely of
character data. Embedding binary data requires an additional agreement as to
how it is encoded as character data, e.g., via base64 encoding. This also
imposes a penalty on both compactness and processing speed. </p><p>Other formats, such as XSL-FO and PDF, provide specific embedding points. For
example, XSL-FO defines the instream-foreign-object element for embedding
objects which are in a non-XSL-FO namespace. By establishing a general
mechanism, they make embedded data interchangeable and manageable because
there is an a priori agreement for creating and identifying embedded files. </p><p>Finally, there are packaging formats, such as MIME multipart/related and ZIP,
which exist solely for the purpose of containing embedded files. Packaging
formats generally provide significant management capabilities by supporting
metadata, signatures, encryption, and compression of embedded files. They
are typically designed specifically for the interchange of these embedded
files. </p><p>Evaluation of a format for embedding support should take into account both
interchange and manageability, as described here, as well as support for
related properties like <a href="#compactness"><b>5.2 Compactness</b></a>, <a href="#random-access"><b>5.20 Random Access</b></a>, <a href="#signable"><b>5.26 Signable</b></a>, <a href="#encryptable"><b>5.8 Encryptable</b></a> and <a href="#streamable"><b>5.28 Streamable</b></a>. </p></div></div><div class="div2">
<h3><a id="encryptable" name="encryptable"></a>5.8 Encryptable</h3><div class="div3">
<h4><a id="N10217" name="N10217"></a>5.8.1 Definition</h4><p>A format is encryptable to the extent to which it makes the encryption and
decryption of a file straightforward and interoperable.</p></div><div class="div3">
<h4><a id="N1021C" name="N1021C"></a>5.8.2 Description</h4><p>In principle any file format is encryptable in that the bytes which compose
any file may be fed to an encryption algorithm. Encryption capabilities,
however, are most useful when the encryptor and decryptor can agree upon
which algorithm was used and which portions of the file are encrypted.
Formats vary in how amenable they are to specifying and maintaining this
information, and this in turn can be a measure of how "encryptable" they
are. </p><div class="div4">
<h5><a id="N10221" name="N10221"></a>5.8.2.1 Partial Encryption</h5><p>It is often desirable to encrypt only a portion of a file. In the most
basic use of this capability a file may contain unencrypted data
regarding the encryption algorithm and parameters used for the remainder
of the file. This can promote interoperability, as described below.</p><p>In other situations it is desirable to leave certain metadata (e.g., SOAP
headers or XMP packets) unencrypted but encrypt the remainder of the
document in order to permit certain routing or query functions to be
performed by intermediaries. In the case of compound documents it is
sometimes desirable to leave the metadata of each embedded document
unencrypted while encrypting the remainder of the document.</p><p>Other things being equal, formats which place all bytes representing the
encoding of data model constructs (such as SOAP headers) in a contiguous
byte range better support partial encryption because those ranges are
more easily determined and specified. We say such formats are "more
encryptable". Formats which permit such ranges to be created but do not
guarantee them are less encryptable because the application must either
determine all ranges which must be encrypted or arrange for that
information to be placed in a contiguous byte range.</p><p>Finally, there are formats which will never place data model constructs
in contiguous ranges but scatter that information into tables and other
mechanisms used to achieve compactness or other format properties. For
example, a format may place element names in a vocabulary index table.
That table may contain names of some elements in the encrypted region
and others which are not; one must then determine how much of the table
to encrypt. Such formats are least encryptable with respect to partial
encryption.</p></div><div class="div4">
<h5><a id="N1022C" name="N1022C"></a>5.8.2.2 Encryption Interoperability</h5><p>Encryptors must be able to communicate to decryptors which portions of a
file are encrypted and by what mechanism. Other things being equal,
formats which make no provisions for recording this information are less
encryptable because they require additional agreement between the
parties involved in order to make encryption interoperate.</p><p>Formats may provide syntax for encoding this information in the file
format itself. Such formats are more encryptable because interoperable
encryption support can be created simply by reference to the format
itself; no additional agreements with decryptors are required.</p></div></div></div><div class="div2">
<h3><a id="explicit-typing" name="explicit-typing"></a>5.9 Explicit Typing</h3><div class="div3">
<h4><a id="N10237" name="N10237"></a>5.9.1 Definition</h4><p>Explicit typing is a property of an XML format in which datatype information
of data model items is intrinsically a part of the format.</p></div><div class="div3">
<h4><a id="N1023C" name="N1023C"></a>5.9.2 Description</h4><p>Datatype information is used to constrain and type validate XML input and to
enable interpretation of data model items in the document as a specified
type.</p><p>In XML 1.x, datatype information is not an intrinsic part of the format.
Common usage is to express datatype information in a separate document such
as an XML Schema. XML applications that require datatype information would
interpret the associated schema document in order to identify the datatype
of specific data model items. Type information may also be conveyed in XML
through the use of some additional markup in the content of the XML
document; for example, by adding an attribute such as "xsi:type" with a type
value to the element for which type information is needed. </p><p>The use of a separate document for datatype information, requiring processing
a schema and mapping the schema against the data model items in the document
instance, may cause degradation of performance. Infrastructure issues such
as how to obtain the schemas and how to assure that they are the correct
versions, are difficult problems to solve and have no standardized
solutions. Putting datatype markup in the document instance may address some
of the problems with using external documents if all types are built-in but
is not an XML language-independent, reliable, or standardized method. If
types are user-defined the schema is actually still required for
interpretation of the type. Conveying type information through markup may
also be an inefficient method for encoding type information in the document.</p><p>Explicit typing as an intrinsic part of an XML format can be schema-dependent
or schema-independent. A scheme for schema-dependent explicit typing might
put type information into the instance but might still rely on a schema for
interpretation. A schema-independent scheme for explicit typing would be
fully self-contained and therefore enable XML applications to perform type
validation and data binding without the overhead of schema processing, but
with the requirement that the type system used is universally understood and
with the limitation that the type system is not extensible. Schema-dependent
explicit typing can still offer some of the advantages of schema-independent
explicit typing in that processing of instances without the schema may be
possible if the set of types used are all universally understood, or if it
is possible to perform useful processing without knowing the schema-based
definition of extended types. This achieves partial self-containment.
Another possibility is to embed schema information for extended types in the
instance itself, making this approach schema-dependent but still fully
self-contained. </p><p>It is possible to represent type information much more compactly as part of
the format than it is through the use of XML markup in the document instance.
Explicit typing offers a way to include type information which has the
advantages of being XML language-independent, reliable and standard. An
explicit typing scheme is required in schema-less formats that represent
primitive datatypes natively. For example, a schema-independent XML format
which represented 32-bit floating point numbers as a 32-bit sequence of bits
would need to tell the parser that the sequence of bits should be interpreted as
a floating point number. </p></div></div><div class="div2">
<h3><a id="extension-points" name="extension-points"></a>5.10 Extension Points</h3><div class="div3">
<h4><a id="N1024D" name="N1024D"></a>5.10.1 Definition</h4><p>An extension point is a method for easily extending a format and its
implementation.</p></div><div class="div3">
<h4><a id="N10252" name="N10252"></a>5.10.2 Description</h4><p>The extension might be for a new, alternate, or experimental version of the
format, an implementation of a layered format, or an application specific or
otherwise proprietary extension. Formats may support extension points in
various ways and to different degrees. Some formats do not allow extension
points in any predefined way. Some formats allow for a single extension
point. Other formats allow multiple extension points but may restrict what
items may be extended, such as adding just new attribute types or character
encoding. The most flexible formats support extension points in all
important ways and on arbitrary data items. An important consideration
relative to concerns about interoperability, evolution, and debugging is
whether features can be represented by standard convention in XML. Some
features that extend existing data models may be part of the initial version
of a format while other potential additions may be implemented as extension
points. An example would be the addition of item metadata such as typing or
encoding. Another example would be the addition of new tokens for support
of a new kind of data model item.</p></div></div><div class="div2">
<h3><a id="format-version-identification" name="format-version-identification"></a>5.11 Format Version Identification</h3><div class="div3">
<h4><a id="N1025B" name="N1025B"></a>5.11.1 Definition</h4><p>This property refers to the ability to efficiently determine the version of a
format from a document instance.</p></div><div class="div3">
<h4><a id="N10260" name="N10260"></a>5.11.2 Description</h4><p>It is considered a best practice to reliably and efficiently identify
versions of a format. XML 1.x supports this notion as part of the document's
optional prolog (if absent, XML version 1.0 is assumed). It is desirable
to access this information as early as possible, so a format that does not
make this information available when the processing starts should be
considered inefficient as far as this property is concerned.</p></div></div><div class="div2">
<h3><a id="fragmentable" name="fragmentable"></a>5.12 Fragmentable</h3><div class="div3">
<h4><a id="N10269" name="N10269"></a>5.12.1 Definition</h4><p>A format is said to be fragmentable when it supports the ability to encode
instances that do not represent the entirety of a document together with
sufficient context for the decoder to process them in a minimally meaningful
way.</p></div><div class="div3">
<h4><a id="N1026E" name="N1026E"></a>5.12.2 Description</h4><p>While typical usage of XML involves exchanging entire documents (the special
case of external parsed entities notwithstanding), it is sometimes desirable
to support the ability to exchange smaller, independently exploitable parts
of a document. The presence of this property largely facilitates a variety
of other properties of a format and processing tasks that may be performed
on top of a document such as the transmission of deltas, error resilience
mechanisms, improved access times, or the prioritized transmission of
document parts.</p><p>This property is similar to <a href="#streamable"><b>5.28 Streamable</b></a> in that processors
featuring these properties are able to deal with small parts of a document,
but it is different in that the fragments can be treated independently and
in arbitrary orders —unlike <a href="#streamable"><b>5.28 Streamable</b></a> where
atomic items are processed in document order. This difference incurs
additional requirements to support the transmission of the context required
to process a fragment (at the very least the set of in-scope namespaces, and
possibly also the values of xml:base, xml:space, and xml:lang). Several
standard efforts within the W3C refer to the ability to fragment XML
documents, notably XQuery Update and XML Fragment Interchange <a href="#xml-fragment-interchange">[XML Fragment Interchange]</a>. </p><p>In addition to the ability to process fragments in isolation, it is possible
to consider storing one or more parts of a document instance as immediately
extractable fragments, so that they can be pulled out with little or no
additional processing cost. For example, by supporting localized versions of
any table tokenization, namespaces, or other redundancy reduction measures
that may have been employed in the document. This ability to encode
self-contained subtrees is also useful to facilitate document
composition.</p></div></div><div class="div2">
<h3><a id="generality" name="generality"></a>5.13 Generality</h3><div class="div3">
<h4><a id="N10284" name="N10284"></a>5.13.1 Definition</h4><p>A format has the property of generality if it is competitive with
alternatives across a diverse range of XML documents, applications and use
cases.</p></div><div class="div3">
<h4><a id="N10289" name="N10289"></a>5.13.2 Description</h4><p>To be successful as a global standard, a format must be valuable for a wide
range of XML documents, applications and use cases. XML documents vary in
size from tens of bytes to tens of gigabytes. They vary in structure from
highly structured data to semi-structured and loosely structured documents.
Some XML applications require strict schema validity, while others deal more
flexibly with schemas or do not use schemas at all. Some XML applications
require preservation of insignificant whitespace, comments, processing
instructions, etc., while others ignore these items or actually prohibit
some of them. Some binary XML use cases need to optimize for compactness at
the expense of speed, some need to optimize for speed at the expense of
compactness and some require a balance between these two extremes. </p><p>Formats that are competitive with alternative solutions across a wide range
of XML documents, applications and use cases are more general and more
likely to succeed as a global standard. On the other hand, formats that are
optimized for specific data, applications or use cases at the expense of
others are more specialized and are not likely to succeed as a global
standard.</p></div></div><div class="div2">
<h3><a id="human-language-neutral" name="human-language-neutral"></a>5.14 Human Language Neutral</h3><div class="div3">
<h4><a id="N10294" name="N10294"></a>5.14.1 Definition</h4><p>A format is human language neutral if it is not significantly more optimal
for processing when its content is in a given language or set thereof, and
does not impose restrictions on the languages or combinations of languages
that may be used with it.</p></div><div class="div3">
<h4><a id="N10299" name="N10299"></a>5.14.2 Description</h4><p>Historically, it has often been a property of many data and document formats
that they only supported a small subset of existing human languages (often
due to supporting a limited legacy character encoding), and therefore were
unusable in a large set of situations. More recent formats such as XML do
not suffer from similar limitations. While it is impossible for a format to
perform identically, in terms of compactness or processing efficiency, for a
language that can be entirely captured using a single byte per character and
for one that requires a multi-byte encoding, not favouring one over the
other ensures better internationalization support, resiliency to the passing
of time, and makes wider adoption possible.</p></div></div><div class="div2">
<h3><a id="human-readable-editable" name="human-readable-editable"></a>5.15 Human Readable and Editable</h3><div class="div3">
<h4><a id="N102A2" name="N102A2"></a>5.15.1 Definition</h4><p>A format is humanly readable and editable to the extent to which a person can
understand and modify it without either a specification of the format or
implementations of that specification. For example, many persons are capable
of reading and editing XML files without having read the XML specification
and without the assistance of XML-specific software applications.</p></div><div class="div3">
<h4><a id="N102A7" name="N102A7"></a>5.15.2 Description</h4><p>Situations often arise in system development, testing, and troubleshooting in
which it is convenient to be able to create, examine or repair data using a
limited tool set. I.e., a text editor is likely available, but a
format-specific parser may not be. Similarly, the data may conform to some
specification but that specification may not be readily available.</p><p>For example, HTML is a humanly readable and editable format. As such, many
HTML files have been created or updated merely by reading other HTML files
and using no more than basic text editors. The mechanism by which the
humanly readable and editable quality of HTML aided its rapid adoption is
sometimes called the "view source" effect, in reference to the web browser
menu item which permits the underlying source HTML of any page to be
viewed.</p><p>A similar situation arises in archiving applications. For short-term
archiving, on the order of a few decades, it is generally reasonable to
assume that format specifications and implementations are available. But for
long-term archiving, on the order of centuries or longer, it is best to
assume that the format specification and implementations will be lost or
unusable and the documents will be deciphered by people.</p><p>Whether for short-term web page authoring or long-term archiving, this
property is fundamentally about how easy it is to understand and modify the
data in a file when little about that file is known. To that end, a format
satisfying this property should attempt to make the <em>first guess</em>
of someone attempting to read or modify the file correct. Specifically:</p><ul><li><p>Use a regular and explicit structure, such as element tags defined by
XML.</p></li><li><p>Use natural language as text and avoid the use of <em>magic</em>
binary values or compression.</p><p>(A magic value is one which is assigned arbitrarily and whose meaning
cannot be derived except by reference to some external authority,
such as a file format specification. For example, if a format used
the number '73' to indicate the use of UTF-8 and '29' to indicate
UTF-16 then it uses a magic number, as there is no decipherable
logic to these assignments. Compare this to a format which instead
uses the strings 'utf-8' and 'utf-16'.)</p></li><li><p>Use a limited number of encoding mechanisms, and prefer those that are
most obvious. For example, use RGB triples to encode color, not both
RGB and CMYK.</p></li><li><p>Be self-contained, as external information (such as a referenced
schema) may not be available. (cf. <a href="#self-contained"><b>5.25 Self Contained</b></a>)</p></li><li><p>Maintain the same order and position of information items as they have
in the data model being represented. For example, keep element names
inline as in XML, and not consolidated into a token dictionary
located elsewhere in the file. Elements should be serialized in the
same order as they appear in the model instead of being stored in,
say, alphabetical order.</p></li></ul></div></div><div class="div2">
<h3><a id="integratable-into-xml-stack" name="integratable-into-xml-stack"></a>5.16 Integratable into XML Stack</h3><div class="div3">
<h4><a id="N102D1" name="N102D1"></a>5.16.1 Definition</h4><p>XML as a data format is surrounded by a large body of specifications that
provide additional features (validation, transformation, querying, APIs,
canonicalization, signatures, encryption, rendering, etc.) considered to
form the <em>XML Stack</em>. A format is said to integrate well into the
XML Stack if it can easily find its place into the large body of XML-related
technologies, with minimal effort in defining new or modified
specifications.</p></div><div class="div3">
<h4><a id="N102D9" name="N102D9"></a>5.16.2 Description</h4><p>One of the great powers of XML is that whenever a technology is added to its
core set (the <em>XML Stack</em>) it becomes instantly available to the
others, thereby leading to better-than-linear increments in the quality and
usefulness of the system. The value of a new format can therefore be in part
measured by how well it integrates into the XML Stack so as to be able to
reuse as much as possible of the existing functionality.</p><p>It should be noted that a vital factor in making this orthogonality in
specifications possible has been the syntax-based nature of XML that enables
a loose coupling between various systems that may then be based on a data
model reused from another specification, on a data model of their own, or
directly on the XML 1.x syntax.</p></div></div><div class="div2">
<h3><a id="localized-changes" name="localized-changes"></a>5.17 Localized Changes</h3><div class="div3">
<h4><a id="N102E7" name="N102E7"></a>5.17.1 Definition</h4><p>A format exhibits localized changes if a single change to an information item
in a data model instance produces a corresponding change in the format that
is limited to a single small range of bytes. A format also exhibits
localized changes if multiple nearby data model changes cause small nearby
changes in the format. This property refers to changes in a new complete
instance relative to an instance that exists prior to changes. This is
distinct from but related to the <a href="#deltas"><b>5.4 Deltas</b></a> property which
represents the ability to produce an instance that consists only of the
changes relative to the original instance. In a delta, the changes are
naturally packed closely together. Some use cases that require localized
changes may be served by the ability to work with deltas.</p></div><div class="div3">
<h4><a id="N102EF" name="N102EF"></a>5.17.2 Description</h4><p>It is possible to create a format where a minor difference between two
logical data model instances causes widespread differences in the byte
representation of the resulting format instances. It is also possible to
create formats where most differences, which could be considered changes,
would be more localized. The default definition of "nearby logical data
model changes" would be the depth-first nature of XML. Other interpretations
are possible, such as changes to siblings versus deep children.</p><p>This property measures whether changes in the data model being represented
are reflected with relatively small and similarly coherent changes in the
format. An individual change should produce a small contiguous byte range
change while a group of data model changes should produce format instance
changes that are relatively similar in distance. The <a href="#deltas"><b>5.4 Deltas</b></a>
property provides for dense format instances that localize changes that are
arbitrarily far apart. Some applications may benefit from localized changes
but be as well or better served by deltas, making this property more
important if deltas are not supported.</p><p>As an example, XML supports localized changes well while gzipped XML does
not. Gzip, being a streaming dictionary-based compressor, will tend to
produce a different byte stream after the point of a difference in
input.</p></div></div><div class="div2">
<h3><a id="no-arbitrary-limits" name="no-arbitrary-limits"></a>5.18 No Arbitrary Limits</h3><div class="div3">
<h4><a id="N102FF" name="N102FF"></a>5.18.1 Definition</h4><p>The No Arbitrary Limits property refers to the degree to which a format
imposes limits on quantities such as sizes, lengths, maximum number of
character encodings, etc. Formats normally establish these limits by
allocating a maximum number of bits for the storage of those quantities.</p></div><div class="div3">
<h4><a id="N10304" name="N10304"></a>5.18.2 Description</h4><p>Arbitrary limits can cause discontinuities in format evolution which
translate into application difficulties for unforeseen uses. A format should
try to avoid imposing limits on quantities such as string lengths, table
sizes, etc. to the extent to which those decisions may result in
incompatibility problems with subsequent revisions of the format designed to
address advancements in technology or new uses. It is worth noting that in
many cases there is a tradeoff between flexibility and ease of use as well
as between flexibility and performance.</p></div></div><div class="div2">
<h3><a id="platform-neutrality" name="platform-neutrality"></a>5.19 Platform Neutrality</h3><div class="div3">
<h4><a id="N1030D" name="N1030D"></a>5.19.1 Definition</h4><p>Platform neutrality is the property of formats that are not significantly
more optimal for processing on some computing platforms or architectures
than on others.</p></div><div class="div3">
<h4><a id="N10312" name="N10312"></a>5.19.2 Description</h4><p>It is naturally impossible for a format to perform identically on all
computer platforms and architectures, but in many cases it is possible to
optimize a format for processing on a given platform to the detriment of
several others. Some platform neutral formats may have set endiannesses or
word-lengths, but these have been chosen to correspond to the format's needs
and not to match a given platform's specificities, e.g., by being defined
around the native structures of a given programming language. Platform
neutrality ensures not only that wide adoption is possible, but also makes
the format more resilient to the passing of time. In some cases, options in
the format may be used based on the preferred parameters of the systems
involved.</p></div></div><div class="div2">
<h3><a id="random-access" name="random-access"></a>5.20 Random Access</h3><div class="div3">
<h4><a id="N1031B" name="N1031B"></a>5.20.1 Definition</h4><p>Random access is the ability to look up the location of data model items in a
document, as opposed to searching for data items by sequential traversal of
XML events or XML structures. Although the objective of Random Access is to
reduce the amount of time (algorithmic complexity) needed to access data
model items, the property is characterized by its support for a random
access method to the data.</p></div><div class="div3">
<h4><a id="N10320" name="N10320"></a>5.20.2 Description</h4><p>Even though the random access method is intended to reduce lookup time, a
cost will be associated with the structures which support the lookup. The
cost in terms of processing time will be at least the time required for a
sequential scan of an input document as well as the additional costs in
terms of storage, memory consumption, and bandwidth utilization. The random
access property will be of interest in those use cases where that cost is
less significant than the access cost to the document without random access.</p><p>Many of the implementation characteristics of random access in XML are
related to the ability to build a table for efficient access to data items
in the document. The following example illustrates a simple form of a random
access table. The byte position and length are obtained for each data item
in the document using a token scanner; this information is later stored in a
table and associated with the document. Given a scheme for looking up a data
item in the table, an application could directly read any data item in a
memory-mapped byte array (using the byte position and length) or it could
perform a sequential byte traversal over the byte stream.</p><p>The access table may contain information other than the position and length
of the data model item. Some additional information will be indispensable in
almost all cases since these two properties will only support look-up of
data items by position. Examples of other information associated with each
entry in the access table include its kind, its schema-described data type,
as well as other schema-described properties such as optionality and
namespace scope (alternately, schema-aware documents could simply reference
schema information which is available in the schema itself stored outside of
the document). This additional information may be more or less useful to
specific types of applications; as stated before, the cost of construction,
access and size of the table will be balanced against the potential benefit
to specific types of applications.</p><p>The access table may be complete, selective, on-demand, or heuristic. A
complete access table would provide addressing information for all data
items in the document. A selective access table would provide addressing
information for some of the data items in the document. For example, a
random access system can provide addressing information only for elements.
An on-demand access table can provide addressing information for specific
data items that have been requested by the application. Finally, a heuristic
random access system can provide addressing information for data items based
on some criteria such as prior experience with related documents.</p><p>The type of access table has bearing on the time needed to construct it, its
size, and the time needed to obtain addressing information for a data item.
These constraints are the trade-offs made against the level of
addressability into the document as offered by the table.</p><p>The actual construction of the access table and the format of the lookup
results is largely an implementation decision but some design features will
better support different approaches to enabling random access to data model
items efficiently. For example, a selective implementation of random access
to data items which supports lookup by element name might be best
implemented as a hash table. An on-demand implementation that uses a
sequence of XPath statements to specify data items of interest may return an
array of nodesets whose order is determined by the input sequence.</p><p>Another characteristic of data items lookup is whether it is guaranteed to
return a unique data item or if multiple data items may match a lookup and,
in this case, whether multiple, iterative, or a single selected item will be
returned. This characteristic will again have a bearing on performance
issues and trade-offs may reasonably be made according to the needs of the
application.</p><p>In allowing direct access into a document, the random access method can,
potentially, have some impact on the binary format's <a href="#fragmentable"><b>5.12 Fragmentable</b></a> property. A format which supports both random
access and fragmentability would have to have a mechanism for providing the
context of a data item. The context should include active namespaces and,
possibly, hierarchical context such as enclosing and ancestor elements. This
context information also has a bearing on the support of schema-awareness
allowed through the random access method (namespaces are needed to resolve
the schema; hierarchical context may be needed to determine the relevant
content model). The context is naturally preserved when the index or access
table is absolutely independent, from the access point of view, from the
infrastructure which supports fragmentation. This design is most feasible
when the content of the document is relatively static and primarily accessed
for reading. In a more dynamic situation, the random access method and
fragmentation infrastructure may need to work together to achieve the
desired goal. In this case, the inclusion of context information may impact
either or both the speed of random access and the storage required by the
access table. A format supporting random access should specify whether it
supports full, partial, or no fragmentability.</p><p>A fundamental distinction in the implementation of the Random Access property
is whether the format contains the access table or it is simply guaranteed
not to be incompatible with the implementation of random access methods. In the
first case the access table is bound to the XML document instance, enabling
it to be transported and accessed by different processing nodes while
retaining all information necessary for random access. In this case, a
standard will have to explicitly specify the format of the access table. In
the second case, transportability of random access information embedded in
the document is not required but the ability to build an access table
external to the document (or multiple documents) is. For example, in a
persistent store (e.g., a database), an index can be built over all
documents in the store but without the additional storage overhead
associated with including an access table in each document. Conditions
incompatible with the random access property of a binary XML format in this
scenario may include such things as the impossibility to construct an index
due to, for example, a compression scheme that does not store data
sequentially, or the prohibitive processing time requirements for the
construction of an index dictated by the complexity associated with the
format.</p><p>It is also possible for these two approaches to coexist; the document may,
optionally, contain an access table and also guarantee efficient
indexability. A consequence of allowing the embedded access table to be
optional is that the document must be complete and capable of being
correctly processed without the access table. It must also be possible to
construct and add the embedded access table from an XML document which does
not contain the access table. Some mechanism must exist to enable
applications to identify whether the access table is present or not, and to
negotiate the format when documents are interchanged. The notion could be
extended to support different types of access tables appropriate to
different application scenarios.</p><p>Two operations are associated with random access: random read (extract) and
random update. The random read operation extracts information from an XML
document. This information can contain one or several values and/or one or
several data model items (subtrees). The random update operation allows data
items in the XML document to be inserted, deleted or replaced using random
access addressing mechanisms to locate the position in the document where
the update is to be made.</p><p>There are multiple implementation techniques for the random update operation.
They can either update documents directly by modifying the document
representation itself or do it indirectly by storing new parts of document
in separate tables. In any case, for interoperability, random update must
allow the updated document to be written out to an XML 1.x representation
with the updates in place.</p><p>Any write process presents challenges with respect to synchronization of
updates when updates may come simultaneously from multiple threads or
processes. A random access update implementation either automatically or
optionally synchronizes updates or assumes the user takes responsibility for
synchronizing data requests.</p><p>Other characteristics of random update include whether it can enforce XML
well-formedness and/or schema compliance of updated data, and whether such
enforcement takes place each time an update is made, or on request, or when
the XML is written out.</p><p>The Random Access property is tightly associated with the efficiency of read
and update operations. Generally speaking, it is possible to build access
tables for an XML document in its textual representation. However, this
would require a processor to maintain all structural information (references
to each item and property) as well as all data type information (for
schema-aware documents) outside of the document itself. Such information,
very often, takes significantly more space than the document itself;
additionally, most of the random access operations require too many reads
and writes in this case.</p><p>A format that includes embedded random access indexing abilities may support
a further variation that could be called a <em>stable virtual
pointer</em> that is valuable for several uses and needed by certain use
cases. A stable virtual pointer is an index entry that is created through
any manner that results in an absolute location in the data model instance.
This index entry has a permanent ID relative to the file and has the
property that any update to the data model will result in the virtual
pointer still pointing to the same location. Location in this case means the
same element, character, or position relative to other items in the data
model instance. Characters, elements, attributes, and other objects can be
inserted, updated, or removed and the virtual pointer continues to point to
the same relative position. The format might support references to this
stable virtual pointer. Internal references might be specially tagged and
should have low complexity of usage for an application. A stable virtual
pointer allows efficient construction, manipulation, and usage of data
models other than tree structured ones, and also allows stable references
both internally and externally which would be complex and costly to maintain
otherwise.</p></div></div><div class="div2">
<h3><a id="robustness" name="robustness"></a>5.21 Robustness</h3><div class="div3">
<h4><a id="N1034D" name="N1034D"></a>5.21.1 Definition</h4><p>A format is said to be robust if it allows processors to insulate erroneous
sections in a way that does not affect the ability to process the remaining
sections.</p></div><div class="div3">
<h4><a id="N10352" name="N10352"></a>5.21.2 Description</h4><p>Processors exhibiting robustness minimize the size of the sections affected
by errors (within some limited boundary of the error positions) and are able
to unequivocally interpret the remaining sections without compromising the
partial format integrity. From an application's point of view, the result is
a partially complete data model instance with sporadic information losses
caused by the errors.</p><p>The robustness property of a format can often be translated into (i) the
processor's ability to detect errors if there are any and (ii) the format's
facility to permit skipping over to the position where the processor can
resume document processing. To support the processor's ability to detect
errors, dedicated redundancy such as a cyclic redundancy check (CRC) may be
added to the format in the so-called channel coding algorithms.</p><p>There are applications that have certain constraints that do not permit or
afford the time of data re-transmission and are required to do their best in
an attempt to recover as much information as possible out of the document
even in the face of errors.</p><p>One such application is found in multimedia broadcasting systems that
broadcast data to wireless devices. Broadcasted media has to be resilient
to possible errors and continuously allow processing on the devices even
with sporadic corruption caused by transmission errors.</p><p>Another example is business document exchange over unreliable networks.
Documents may consist of data of different levels of significance. Some
portion of the document may be absolutely crucial for performing the
business while others may be merely informative. It is often the case that
businesses can move forward as soon as the crucial data is processed even if
errors are found in the informative parts.</p><p>The property of robustness is primarily concerned with bit-stream level
errors and is generally agnostic of such types of errors as are found
at the document level such as grammatical or semantic errors. It does not
cover the behaviour of lax parsers such as most HTML browsers that rely on
complex and non-standard heuristics in order to continue processing in the
face of severe errors so as to interpret authorial intent and provide bug
for bug compatibility with previous implementations. Rather, robustness
corresponds to draconian processing that may be compartmentalized to
fragments of a document.</p></div></div><div class="div2">
<h3><a id="roundtrip-support" name="roundtrip-support"></a>5.22 Roundtrip Support</h3><div class="div3">
<h4><a id="N10365" name="N10365"></a>5.22.1 Definition</h4><p>A format supports roundtripping if converting a file from XML to that format
and back produces an output equivalent to the original input.</p><p>A format supports roundtripping via XML if converting a file from that format
to XML and back produces an output equivalent to the original input.</p></div><div class="div3">
<h4><a id="N1036C" name="N1036C"></a>5.22.2 Description</h4><p>In the course of processing, a file may be converted between XML and an
alternate representation, or vice versa, one or more times. Roundtrip
support measures the degree to which the original input and the final output
of this process are equivalent, assuming no other changes to the file.</p><p>A format may support roundtripping to various degrees. Exact equivalence
means that an exact copy of the input can be produced from the intermediate
format. Lossless equivalence means that an XML instance or fragment can be
output and verified which is identical to the input XML instance or fragment
in all aspects which are significant and can only differ in those aspects
which are not significant.</p><p>A format generally supports lossless equivalence if it directly supports the
same data models as XML, as this means each element of the data model
instance has a representation both in that format and in XML. Otherwise, a
file may contain an element with no representation and which is therefore
lost during conversion.</p><p>For example, if a format supported nested attributes on attributes, this
information would have no direct equivalence when converting to XML's
single-level attributes. This is not to say that such nested attributes
could not be encoded in XML according to some encoding scheme, but that any
such encoding scheme would not qualify as direct support for the data model.
Support for round tripping is defined to require direct data model support.</p><p>
<b>Relationship to Canonicalization:</b> Equivalence verification is
necessary to prove that round-tripping has been successful. If a format
supports exact equivalence then verification is trivial. Otherwise, the data
must first be converted to a canonical form. XML 1.x canonicalization is an
expensive operation. Short of support for exact equivalence, a format which
supports a more efficient canonicalization algorithm is preferable to one
that does not.</p></div></div><div class="div2">
<h3><a id="schema-extensions-deviations" name="schema-extensions-deviations"></a>5.23 Schema Extensions and Deviations</h3><div class="div3">
<h4><a id="N10380" name="N10380"></a>5.23.1 Definition</h4><p>Support for schema extensions and deviations is the ability to represent
information items that were either not defined in the schema associated with
the input document or do not conform exactly to the associated schema
definitions. The phrase <em>open content</em> has been used to refer to
one form of schema extension, wherein an instance document is permitted to
include elements and attributes beyond those defined by the schema. All
non-schema-based formats exhibit this property. A format that prohibits
applications from intentionally encoding information sets that do not
conform to the given schema does not exhibit the schema extensions and
deviations property.</p></div><div class="div3">
<h4><a id="N10388" name="N10388"></a>5.23.2 Description</h4><p>In the pursuit of more space efficient encodings of data, one strategy is the
use of a schema of some kind to inform the encoder and decoder. There are
several different ways that this information can be used to minimize data in
an instance. Some strategies result in removing some or all self-describing
structure, type information, and identifiers in a way that often makes
evolution of the schema, encoding applications, and decoding applications
rigid and difficult. Other strategies allow more flexibility and more loose
coupling. Self-contained, fully self-describing, methods exhibit this
property completely.</p><p>This property illustrates the tradeoff between space efficiency and
flexibility. Flexibility can include mismatch in versions between schemas or
encoder/decoder versions, parts of a logical schema that evolve
independently in a collection of applications or partners, and the general
ability to evolve gracefully with loose coupling. Schema-informed strategies
are one of several types of methods for minimizing data size. Other methods
generally provide full support for arbitrary instances but may have other
tradeoffs. While more than one method may be employed in a format, the
primary method can be seen as a major decision in a format.</p><p>An important distinction exists between possible solutions that require
explicit prior indication of what elements may be extended and those
solutions that allow extension anywhere. With some format designs, this
indication is by schema metadata. These formats would not satisfy the schema
extensions and deviations property because these definitions are effectively
part of the schema. Schema extensions and deviations may or may not be
encoded relative to another schema. This property is different from <a href="#extension-points"><b>5.10 Extension Points</b></a> in that it refers to dynamic extensions to a
schema or the case in which no schema is used at all.</p><p>From some perspectives, the ability to handle arbitrary instances is
potentially not part of what would be called "valid XML" because it is not
described by a schema or not described in detail. Other perspectives include
environments, application architectures, and development situations where
infrastructure requiring schemas for validation and/or encoding is onerous
or impossible to use. The ultimate authority for validity of a data model
instance lies with the developer and application that works with the data
object. Particular schema specifications represent typical validation needs
factored out into a common language and potentially for data-driven
validation engines. These will never be complete or sufficient for every
case and may be difficult to fit to some needs. In some cases, the need for
flexibility and other solution aspects contraindicates schemas and formats
that require any use of schemas.</p></div></div><div class="div2">
<h3><a id="schema-instance-change-resilience" name="schema-instance-change-resilience"></a>5.24 Schema Instance Change Resilience</h3><div class="div3">
<h4><a id="N1039A" name="N1039A"></a>5.24.1 Definition</h4><p>A schema instance change resilient format is one in which areas of interest
that do not change or, with less flexibility, are changed only in restricted
ways, such as additions, are still valid to receivers expecting earlier or
later versions even after other areas have changed. An area of interest may
be defined as a subtree, an XPath, or other data model items.</p></div><div class="div3">
<h4><a id="N1039F" name="N1039F"></a>5.24.2 Description</h4><p>It is very common for a data format instance definition to change over time.
A format supports limited Schema Instance Change Resilience if only the
schema or related metadata needs to be distributed. A format is fully
flexible if any change is supported and less so if restricted, to additions
for example. Full support means that changes are only needed when directly
affected by modifications, such as removing a data model item. </p><p>This property is related to <a href="#self-contained"><b>5.25 Self Contained</b></a> and may be
fulfilled with or without self containment. A non-self-contained solution
might rely on loadable schema meta-data or delta parent instances.</p><p>There are three categories of serializations with respect to Schema Instance
Change Resilience: (i) requires schema-related updates for any changes (ii)
does not require schema-related updates for certain changes such as addition
and (iii) does not require schema-related updates.</p></div></div><div class="div2">
<h3><a id="self-contained" name="self-contained"></a>5.25 Self Contained</h3><div class="div3">
<h4><a id="N103AF" name="N103AF"></a>5.25.1 Definition</h4><p>An XML format is self-contained if the only information that is required to
reproduce the data model instance is (i) the representation of the data
model instance and (ii) the specification of the XML format.</p></div><div class="div3">
<h4><a id="N103B4" name="N103B4"></a>5.25.2 Description</h4><p>For applications in which the receiver is unable to request or receive
additional information, it is important that the document instances are self
contained. This is desirable in applications in which it would be difficult,
impractical, or costly to access additional resources.</p><p>An example of such an application is an archiving application where there is a
significant time lag between generation and consumption of the data. To
ensure that the receiver is capable of reproducing the data model instance
from the archived format, the format must be self contained. Accordingly, no
additional information is required which might no longer be available at the
time of consumption.</p><p>Another example would be an infrastructure consisting of intermediary
applications that are placed between senders and receivers without their
prior knowledge. For instance, XML firewalls and load balancing applications
must efficiently inspect the content of document instances in order to make
decisions. A format that is <em>always self contained</em> is helpful in
such situations. However, applications may still be able to operate if they
can be pre-configured with the additional knowledge external to the format
definition that is required. In addition, formats that provide optional
support for this property can still be used if this feature is negotiated
during the initial handshake.</p><p>Schema-based formats allow for the efficient representation of data model
instances due to knowledge of the structure and datatypes defined in the
schema. Thus, a schema-based format requires the receiver to have access to
the schema definitions for the decoding process to be successful. A
schema-based format is considered <em>not self contained</em> unless the
schema information is also stored as part of the format (an optional feature
to include the schema would be an example of an optionally self-contained
format). The partial use of schema information would lead to several
partially self-contained formats.</p><p>The following example illustrates the general concept of schema-based versus
non-schema-based encodings. Consider the following simple XML fragment that
might be used to describe a pixel of data: </p><div class="exampleInner"><pre>&lt;red&gt;10&lt;/red&gt;
&lt;green&gt;20&lt;/green&gt;
&lt;blue&gt;30&lt;/blue&gt;</pre></div><p>One way of describing this fragment in binary would simply be to serialize
the three values as binary bytes:</p><div class="exampleInner"><pre>10,20,30</pre></div><p>The receiver would need to know in advance what each of these bytes meant.
This could be done by means of a simple schema:</p><div class="exampleInner"><pre> byte 1 = red
byte 2 = green
byte 3 = blue</pre></div><p>This would be fine until something changed on either side of the link. For
example, one side might now define a new color component (alpha):</p><div class="exampleInner"><pre> byte 1 = red
byte 2 = green
byte 3 = blue
byte 4 = alpha</pre></div><p>If the other side was not immediately made aware of this change (a very
common real-world occurrence), it would be expecting three bytes, but would
receive four, and would therefore not know what to do with the last byte.
Most likely, it would be ignored.</p><p>A more serious issue would arise if alpha was inserted at a different
position than the end:</p><div class="exampleInner"><pre> byte 1 = red
byte 2 = alpha
byte 3 = green
byte 4 = blue</pre></div><p>In this case, the processor that was not aware of the change would likely
interpret the newly added byte in the wrong way. </p><p>The format is self-contained if everything that the receiver needs to know to
decode the contents is included. The XML instance at the beginning of the
example illustrates this. So does the binary case if the schema is included
(most likely in an optimized binary form) with the message contents:</p><div class="exampleInner"><pre> byte 1 = red
byte 2 = alpha
byte 3 = green
byte 4 = blue
10,40,20,30</pre></div><p>By moving some information from the schema to the message format, improved
resilience to change can be achieved. For example, we could define a simple
type system for the color values and add these identifiers to the message
instance: </p><div class="exampleInner"><pre> Schema:
'1' = &lt;red&gt;, 1 byte
'2' = &lt;green&gt;, 1 byte
'3' = &lt;blue&gt;, 1 byte
Message instance:
'1',10,'2',20,'3',30</pre></div><p>This allows for two things that could not have been done before:</p><ul><li><p>The elements can be transmitted in any order;</p></li><li><p>If an additional element is added at one transfer end that the other
end does not know about, it can take an appropriate action. For
example, if '4' was added as the type code for alpha in the previous
example, the processor without this information could simply discard
a '4' element because it did not know what to do with it and still
function normally as it was before.</p></li></ul><p>Additional tradeoffs can be made by adding length information to allow
resilience to content length changes and additional type information using
well-known type codes to allow a message instance to be interpreted to some
degree without the need for a schema.</p></div></div><div class="div2">
<h3><a id="signable" name="signable"></a>5.26 Signable</h3><div class="div3">
<h4><a id="N103FC" name="N103FC"></a>5.26.1 Definition</h4><p>A format is signable to the extent to which it makes the creation and
validation of digital signatures both straightforward and interoperable.</p></div><div class="div3">
<h4><a id="N10401" name="N10401"></a>5.26.2 Description</h4><p>In principle any file format is signable, in that the bytes which compose any
file may be fed to a digital signature algorithm. Signatures, however, are
only useful when they can be both created and verified, and verification
requires that the verifier be able to determine precisely which bytes were
signed, by whom, and how. Formats vary in how amenable they are to
specifying and maintaining this information, and this in turn can be a
measure of how "signable" they are.</p><div class="div4">
<h5><a id="N10406" name="N10406"></a>5.26.2.1 Byte Sequence Preservation</h5><p>Other things being equal, file formats which define a one-to-one
relationship between each possible data model instance and the
serialization of that instance are more easily signed because they
require that processors maintain the byte sequences exactly. A text file
can be said to operate this way, in that a change to any byte of the
file results in a different text file. </p><p>Other formats define a one-to-many relationship between the data model
instance and possible serializations. Such formats permit (or even
invite) processors to modify the byte sequences. XML is such a format;
for example, a processor could replace each character entity in an XML
document with a numeric entity reference and have encoded the same
information but with a significantly different byte sequence. The
ability to sign XML then requires the development of a canonicalization
algorithm which defines a particular serialization of each data model
instance that is used for signature purposes. </p><p>Finally, a format which has a one-to-many relationship between data model
instances and serializations but also defines a canonical serialization
might be considered as falling in between the two extremes; signing and
verifying is more work than if there is only one serialization but it
saves the effort of developing the canonical format itself.</p></div><div class="div4">
<h5><a id="N1040F" name="N1040F"></a>5.26.2.2 Partial Signatures</h5><p>It is often desirable to sign only a portion of a file, such as in
electronic document use cases in which multiple signatures are attached
to a single document. This capability is about both which portion of the
file is not signed, and therefore modifications to which will not break
a signature, as well as which portion is signed. In such use cases, the
signed portion is determined at the semantic (i.e., schema) level in the
data model. For example, a signature may be applied to page one of a
multi-page document but not to any other pages.</p><p>It is critical that such signatures are calculated over all portions of
the file which encode information relevant to the semantic data model
construct; otherwise, portions not included may be modified and the
signature is insecure. For example, consider an XML document in which a
sub-tree uses a namespace prefix. If the prefix declaration is outside
the sub-tree and therefore not covered by the signature then the
declaration can be altered —thus changing the meaning of the
signed portion— without breaking the signature.</p><p>Other things being equal, then, formats which place all bytes
representing the encoding of semantic data model constructs in a
contiguous byte range are more signable because those ranges are more
easily determined and specified. Formats which permit such ranges to be
created but do not guarantee them are less signable because the
application must either determine all ranges which must be signed or
arrange for that information to be placed in a self-contained sub-tree
contiguous byte range.</p><p>Finally, there are formats which will never place semantic constructs in
contiguous ranges but scatter that information into tables and other
mechanisms used to achieve compactness or other format properties. For
example, a format may place element names in a vocabulary index table.
That table may contain names of some elements in the signed region and
others which are not; one must then determine how much of the table to
sign and how to permit subsequent modifications to only break the
signature when necessary. Such formats are least signable with respect
to partial signatures.</p></div><div class="div4">
<h5><a id="N1041A" name="N1041A"></a>5.26.2.3 Signature Interoperability</h5><p>Signers must be able to communicate to signature verifiers which bytes
were signed, by whom, and how. Other things being equal, formats which
make no provisions for recording this information are less signable
because they require additional agreement between the parties involved
in order to make signatures interoperate.</p><p>Other formats may provide syntax for encoding this information in the
file format itself. Such formats are more signable because interoperable
signatures can be created simply by reference to the format itself; no
additional agreements with verifiers are required.</p></div></div></div><div class="div2">
<h3><a id="specialized-codecs" name="specialized-codecs"></a>5.27 Specialized codecs</h3><div class="div3">
<h4><a id="N10425" name="N10425"></a>5.27.1 Definition</h4><p>This is a property of formats that are able to associate processor extensions
(known as plugins or codecs) with specific parts of a document in order to
encode and decode them more optimally than the processor's default approach
would.</p></div><div class="div3">
<h4><a id="N1042A" name="N1042A"></a>5.27.2 Description</h4><p>Some specific vocabularies contain data structures that can benefit from
special treatment, typically in order to obtain higher compression ratios or
much faster processing. However, it would naturally be highly impractical to
include all such specific cases in the default format specification so that
all processors would have to implement them, while only a small subset of
users would exercise the functionality.</p><p>Therefore, a format may include the ability to reference predefined
extensions to the processor (both encoder and decoder) that are tailored to
a specific need and can therefore encode certain parts of the document in an
optimized manner. This requires the format to be able to flag segments as
encoded with additional software, and the processors to be able to read
these segments via the use of extensions.</p><p>Though the presence of this property makes the format in general more
suitable to a larger set of uses and less likely to include very specialized
features of use to only a small fragment of the format's user base, it also
carries a high cost in terms of interoperability as it requires all
participants involved in the exchange to support the additional software
that knows how to decode the specific extension.</p><p>While in practice there are subtleties as to the ways in which this property
can be supported, it is more accurately measured as a boolean indicating
whether a format supports it or not.</p></div></div><div class="div2">
<h3><a id="streamable" name="streamable"></a>5.28 Streamable</h3><div class="div3">
<h4><a id="N10439" name="N10439"></a>5.28.1 Definition</h4><p>For a format, streamability is a property of the processor (serializer,
parser) that converts a data model instance into the format and vice versa.
A processor is streamable if it is able to generate correct partial output
from partial input. A format is streamable if it is possible to implement
streamable processors for it. (Note: in some industries this property is
referred to as incremental processing.)</p></div><div class="div3">
<h4><a id="N1043E" name="N1043E"></a>5.28.2 Description</h4><p>Streamability is needed in memory-constrained environments where it is
important to be able to handle data as it is generated to avoid buffering of
data inside the processor. It is crucial when a document is generated
piecemeal by some outside process, possibly with indefinitely long breaks
between consecutive parts. Examples of the former requirement are provided
by the <a href="http://www.w3.org/TR/xbc-use-cases/#ws-for-small-devices">Web Services for Small Devices</a> and the <a href="http://www.w3.org/TR/xbc-use-cases/#xml-docs-mobile">Multimedia XML
Documents for Mobile Handsets</a> use cases. Examples of the
latter requirement are provided by the <a href="http://www.w3.org/TR/xbc-use-cases/#metadata">Metadata in
Broadcast Systems</a> and the <a href="http://www.w3.org/TR/xbc-use-cases/#xmpp">XMPP Instant Messaging
Compression Use Cases</a>.</p><p>A precise definition can be derived by assuming that the data model consists
of atomic components, which are assembled into documents in some structured
manner. The serialization of a document expressed in the data model is then
simply a traversal of its atomic components in some defined order with each
applicable component being translated to the output stream. Output
streamability is the ability to create a correct initial sequence of the
output stream from a partial traversal. Input streamability is the ability
to create the corresponding partial traversal from such an initial sequence
so that the application can process the results of this partial traversal as
if it were traversing the complete document.</p><p>Streamability is also characterized by the amount of buffering that needs to
be done in the processors. Buffer space is measured in the number of items
in the input. For a serializer, the atomic components of the data model; for
a parser, the elements (e.g., bytes) of the stream. A requirement for
streamability is that both processors be implementable such that they only
require constant buffer space, no matter what the input document is or how
it is mapped to the data model.</p><p>Another important consideration not captured by the above is the need for
lookahead in the parser. If the parser is required to look ahead in the
input stream to determine where the atom currently being read ends, and it
is possible that the lookahead is not available (e.g., due to the serializer
concurrently streaming the output), streamability is lost.</p><p>Examples of non-streamable formats can be had by considering subsequences of
the atomic component traversal during serialization. For some types of
sequences it can be beneficial to have the length of the full serialized
form of the sequence precede the actual sequence, so the serializer must
buffer the whole sequence before outputting anything. If such sequences can
be arbitrarily long, this sacrifices output streamability. A concrete
example for XML would be having an index at each element start that
indicates where the element ends in order to support the <a href="#accelerated-sequential-access"><b>5.1 Accelerated Sequential Access</b></a> property.</p><p>The buffer space requirement precludes some serialization techniques, e.g.,
compression over the whole document. This shows a trade-off between
streamability and the <a href="#compactness"><b>5.2 Compactness</b></a> property. The example
above indicates that building an element index for accelerated sequential
access on the fly may not be possible in all cases.</p><p>Streamability (both input and output) is always considered relative to a data
model. Once the data model is fixed, streamability is defined to be a
boolean property: a format/processor is either streamable or not
streamable.</p></div></div><div class="div2">
<h3><a id="support-for-error-correction" name="support-for-error-correction"></a>5.29 Support for Error Correction</h3><div class="div3">
<h4><a id="N10479" name="N10479"></a>5.29.1 Definition</h4><p>This property requires that error correcting codes can be applied to the
representation of XML data model instances. Error correcting codes applied
to a format: (a) make it possible to identify a section in which format errors can be
located and (b) make it possible to recover the undistorted section from the erroneous
one.</p><p>Representation formats of XML data model instances can be categorized in
three classes: (i) no partitioning of the representation is possible, (ii)
partitioning of the representation is possible and (iii) partitioning
according to the importance of the information is possible.</p></div><div class="div3">
<h4><a id="N10480" name="N10480"></a>5.29.2 Description</h4><p>Error correction requires that redundancy is contained in the format to allow
for recovery even if errors occurred during transmission. The redundancy can
serve on one hand to identify that an error has occurred and in certain
circumstances to correct the error. </p><p>Various algorithms exist that insert redundancy so that a decoder is capable
of detecting and potentially correcting errors. These techniques are called
forward error correction (FEC) since they do not require further backward
communications between the receiver and the original sender. Examples of
block based FEC algorithms include Hamming and Reed-Solomon codes; an
example of a continual FEC is the Turbo Code.</p><p>In general the error correction requires methodologies known as channel
coding. Usually these algorithms are applied separately from those of source
coding for efficient, redundancy free information representation. Handling
them separately enables the adaptation of the channel coding, i.e., the
insertion of redundancy into the representation tuned to the expected
channel characteristics. For instance a channel with large error bursts
might require applying interleaving, i.e., a defined re-sorting of bits, so
the error bursts are distributed over a larger section of the bit stream.
This enables forward error correction mechanisms to be also applied in case
of channels with error bursts such as wireless channels.</p><p>To support error correction, an XML format has to interface with common
channel coding algorithms. For interfacing the format shall allow
partitioning of the representation according to the importance of the
represented information to support unequal error protection based on the
importance of the information. For instance in EPG data rights information
might be ranked more important than names of actors. Accordingly, being
<a href="#fragmentable"><b>5.12 Fragmentable</b></a> is a prerequisite to support error
correction.</p></div></div><div class="div2">
<h3><a id="transport-independence" name="transport-independence"></a>5.30 Transport Independence</h3><div class="div3">
<h4><a id="N10492" name="N10492"></a>5.30.1 Definition</h4><p>A format is transport independent if the only assumptions of transport
service are error-free and ordered delivery of messages without any
arbitrary restrictions on the message length.</p></div><div class="div3">
<h4><a id="N10497" name="N10497"></a>5.30.2 Description</h4><p>Formats should be independent from the transport service. A format must state
its assumptions (if any) about characteristics of the transport service in
addition to error-free and ordered delivery of messages without any
arbitrary restrictions on the message length. Protocol binding specifies how
a format is transmitted as payload in a specific transport (e.g., TCP/IP) or
messaging (e.g., HTTP) protocol.</p></div></div></div><div class="div1">
<h2><a id="additional-considerations" name="additional-considerations"></a>6 Additional Considerations</h2><div class="div2">
<h3><a id="forward-compatibility" name="forward-compatibility"></a>6.1 Forward Compatibility</h3><p>Forward compatibility supports the evolution to new versions of a format. XML has
changed very little while data models have continued to evolve. Additional
specifications have added numerous conventions for linking, schema definition,
and security. A format must support the evolution of data models and must allow
corresponding implementation of layered standards. Evolution of XML and its data
models could mean additional character encodings, additional
element/attribute/body structure, or new predefined behavior similar to ID
attributes. Examples might be more refined intra-document pointers, type or
hinting attribute standards, or support for deltas. An implementation should
indicate how certain classes of model changes might be implemented. This
resilience relates to properties like <a href="#extension-points"><b>5.10 Extension Points</b></a>,
<a href="#no-arbitrary-limits"><b>5.18 No Arbitrary Limits</b></a> and <a href="#format-version-identification"><b>5.11 Format Version Identification</b></a>.</p></div><div class="div2">
<h3><a id="implementation-cost" name="implementation-cost"></a>6.2 Implementation Cost</h3><p>A requirement on XML was "It shall be easy to write programs which process XML
documents." This property covers the implementation of a generic tool chain, but
not any application-specific processing code.</p><p>A low implementation cost may contribute to <a href="#widespread-adoption"><b>6.5 Widespread Adoption</b></a>
in that if tools to process the format need to be implemented as a part of an
application (e.g., because they do not exist for the target platform), a
low-cost format is more likely to be adopted. To fulfill this requirement the
format needs to be easy enough to implement so that this additional
implementation is not an impediment to the actual application development.
However, low implementation cost is not necessary to achieve widespread
adoption.</p><p>A rough estimate of implementation cost can be made by considering how much time
it takes for a solitary programmer to implement sufficiently robust
processing of the format (the so-called <em>Desperate Perl Hacker</em> measure). A proposed upper
limit in the case of XML was a couple of hacking sessions (though this limit has
proven to be too optimistic). An alternate format needs to do at least as well
as XML, and preferably better, to fulfill this requirement.</p><p>Another factor to consider is the kind of code that must be written to implement
the format. If either input or output require sophisticated algorithms (e.g.,
specialized compression not ubiquitously available), this increases the format's
implementation cost. If the format has several optional properties and, e.g.,
size-decreasing special serialization possibilities, the number of required
possible code paths in the processors increases. This makes the processors
harder to test comprehensively, and hence contributes to their fragility,
requiring more time for a robust implementation.</p><p>A lowering of implementation cost for an alternate representation can be achieved
by making its processing be XML-compatible at as low a level as possible. This
helps by making it possible to utilize existing XML-based infrastructure to a
larger extent. However, an incompatible interface at any level may allow more
efficient handling than an existing one that may be unnatural for the format,
thus justifying a higher implementation cost.</p></div><div class="div2">
<h3><a id="royalty-free" name="royalty-free"></a>6.3 Royalty Free</h3><p>Free in the context of an XML format means that it is free to create and use an XML
format and the right to build tools and applications for the format is
completely unencumbered and royalty-free. </p><p>If the format is unencumbered and royalty-free it can be recommended by the W3C
and stands a better chance for adoption across the industry. These conditions
can positively affect the potential for ubiquitous use of the format. A free
format is also more likely to have free, open source code for processing it and
free tools for building applications which use it, especially when its <a href="#implementation-cost"><b>6.2 Implementation Cost</b></a> is low. This is another factor in the potential
for ubiquitous use of the format.</p></div><div class="div2">
<h3><a id="single-conformance-class" name="single-conformance-class"></a>6.4 Single Conformance Class</h3><p>There must be a single conformance class. Thus, a compliant implementation must
support all features defined in the specification —yet it is not
required to employ them all in every usage instance.</p><p>All implementations must demonstrate interoperability with a test suite testing
each feature. Having only a single conformance class tends to lower
implementation cost, decrease complexity for application development and support
ubiquitous implementation. In the past, <em>lite</em> versions of standards
have been created to support devices of limited capability. The need for these
provisos at the standard level is rapidly decreasing as even commodity devices
are quickly growing in capability.</p></div><div class="div2">
<h3><a id="widespread-adoption" name="widespread-adoption"></a>6.5 Widespread Adoption</h3><p>A format is more ubiquitous to the extent it has been implemented on a greater
range of computing devices (i.e., different architectures), on a greater number
of computing devices (i.e., millions of devices implementing a single
architecture), and used in a wider variety of applications. </p><p>While ubiquity can be measured for existing formats, it cannot be used as a point
of comparison with new formats which are, by definition, not yet ubiquitous.
Ubiquity may be predicted by <a href="#implementation-cost"><b>6.2 Implementation Cost</b></a>. A low implementation cost
format is likely to create an environment where a very large community of
developers will be willing and able to create a critical mass of tools, with a
resultant feedback and amplification result that leads the marketplace toward
ubiquitous implementation. XML is considered by most to be a good example
of a low cost/complexity format leading to ubiquity.</p><p>On the other hand, a high implementation cost format that can meet the
environmental constraints of a large number of devices, applications, and/or use
cases could also stand a good chance of becoming ubiquitous due to
economically-motivated industry commitment. Counterexamples where ubiquity was
obtained by trading off, to a greater or lesser extent, ease of implementation
include PDF and MPEG-3. While each of these certainly has significant costs of
implementation, each also addresses the needs of its user domain so well that
each has attained ubiquity in that domain.</p></div></div><div class="div1">
<h2><a id="N104E9" name="N104E9"></a>7 References</h2><dl><dt class="label"><a id="XBCWG" name="XBCWG"></a>XBC Use Cases</dt><dd>
<a href="http://www.w3.org/TR/xbc-use-cases/"><cite>XML
Binary Characterization Use Cases</cite></a>
(See http://www.w3.org/TR/xbc-use-cases/.)</dd><dt class="label"><a id="XML10" name="XML10"></a>XML 1.0</dt><dd>
<a href="http://www.w3.org/TR/REC-xml/"><cite>Extensible Markup Language (XML) 1.0</cite></a>
(See http://www.w3.org/TR/REC-xml/.)</dd><dt class="label"><a id="XML11" name="XML11"></a>XML 1.1</dt><dd>
<a href="http://www.w3.org/TR/xml11/"><cite>Extensible Markup Language (XML) 1.1</cite></a>
(See http://www.w3.org/TR/xml11/.)</dd><dt class="label"><a id="SGML" name="SGML"></a>ISO 8879</dt><dd>
<a href="http://www.iso.org/iso/en/CatalogueDetailPage.CatalogueDetail?CSNUMBER=16387"><cite>Standard Generalized Markup Language (SGML)</cite></a>
(See http://www.iso.org/iso/en/CatalogueDetailPage.CatalogueDetail?CSNUMBER=16387.)</dd><dt class="label"><a id="xml-infoset" name="xml-infoset"></a>XML Infoset</dt><dd>
<a href="http://www.w3.org/TR/xml-infoset/"><cite>XML
Information Set</cite></a>
(See http://www.w3.org/TR/xml-infoset/.)</dd><dt class="label"><a id="xml-schema-structures" name="xml-schema-structures"></a>Schema Part 1</dt><dd>
<a href="http://www.w3.org/TR/xmlschema-1/"><cite>XML
Schema Part 1: Structures</cite></a>
(See http://www.w3.org/TR/xmlschema-1/.)</dd><dt class="label"><a id="xquery-dm" name="xquery-dm"></a>XQuery DM</dt><dd>
<a href="http://www.w3.org/TR/xpath-datamodel/"><cite>XQuery
1.0 and XPath 2.0 Data Model</cite></a>
(See http://www.w3.org/TR/xpath-datamodel/.)</dd><dt class="label"><a id="xml-dsig" name="xml-dsig"></a>XML Digital Signature</dt><dd>
<a href="http://www.w3.org/TR/xmldsig-core/"><cite>XML-Signature Syntax and Processing</cite></a>
(See http://www.w3.org/TR/xmldsig-core/.)</dd><dt class="label"><a id="pdf-ref" name="pdf-ref"></a>PDF Reference</dt><dd>
<a href="http://partners.adobe.com/public/developer/pdf/index_reference.html"><cite>PDF
Reference</cite></a>
(See http://partners.adobe.com/public/developer/pdf/index_reference.html.)</dd><dt class="label"><a id="xml-fragment-interchange" name="xml-fragment-interchange"></a>XML Fragment Interchange</dt><dd>
<a href="http://www.w3.org/TR/2001/CR-xml-fragment-20010212"><cite>XML
Fragment Interchange</cite></a>
(See http://www.w3.org/TR/2001/CR-xml-fragment-20010212.)</dd></dl></div></div><div class="back"><div class="div1">
<h2><a id="N1058E" name="N1058E"></a>A Acknowledgments</h2><p>The properties have been gathered by the XBC Working Group contributors:
Robin Berjon (Expway), Carine Bournez (W3C), Don Brutzman (Web3D), Mike Cokus (MITRE), Roger Cutler (ChevronTexaco), Ed Day (Objective Systems), Fabrice Desré (France Telecom), Seamus Donohue (Cape Clear), Olivier Dubuisson (France Telecom), Oliver Goldman (Adobe), Peter Haggar (IBM), Takanari Hayama (KDDI), Jörg Heuer (Siemens), Misko Hevery (Adobe), Alan Hudson (Web3D), Takuki Kamiya (Fujitsu), Jaakko Kangasharju (University of Helsinki), Arei Kobayashi (KDDI), Eugene Kuznetsov (DataPower), Terence Lammers (Boeing), Kelvin Lawrence (IBM), Eric Lemoine (Tarari), Dmitry Lenkov (Oracle), Michael Leventhal (Tarari), Don McGregor (Web3D), Ravi Murthy (Oracle), Mark Nottingham (BEA), Santiago Pericas-Geertsen (Sun), Liam Quin (W3C), Kimmo Raatikainen (Nokia), Rich Salz (DataPower), Paul Sandoz (Sun), John Schneider (AgileDelta), Claude Seyrat (Expway), Paul Thorpe (OSS Nokalva), Alessandro Triglia (OSS Nokalva), Stephen D. Williams (Invited Expert).</p></div></div></body></html>