<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Multimodal Interaction Use Cases</title>
<meta name="generator"
content="HTML Tidy for Linux/x86 (vers 1st April 2002), see www.w3.org" />
<meta http-equiv="CONTENT-TYPE"
content="text/html; charset=iso-8859-1" />
<style type="text/css">
/*<![CDATA[*/
body {
margin-left: 8%;
margin-right: 5%;
background-color: white;
font-family: Trebuchet, Arial, sans-serif
}
h1 { margin-left: -4%; color: rgb(0,92,160) }
h2 { margin-left: -4%; color: rgb(0,92,160)}
h3 { margin-left: 0% }
p.fig {text-align: center}
.c1 { display: none }
p.example { margin-left: 10% }
tr td { vertical-align: top }
/*]]>*/
</style>
<link rel="stylesheet" type="text/css"
href="http://www.w3.org/StyleSheets/TR/W3C-NOTE" />
</head>
<body>
<div class="head">
<p><a href="http://www.w3.org/"><img height="48" alt="W3C"
src="http://www.w3.org/Icons/w3c_home" width="72" /></a></p>
<h1 class="notoc" id="name">Multimodal Interaction Use Cases</h1>
<h2 class="notoc" id="date">W3C NOTE 4 December 2002</h2>
<dl>
<dt>This version:</dt>
<dd><a
href="http://www.w3.org/TR/2002/NOTE-mmi-use-cases-20021204/">
http://www.w3.org/TR/2002/NOTE-mmi-use-cases-20021204/</a></dd>
<dt>Latest version:</dt>
<dd><a
href="http://www.w3.org/TR/mmi-use-cases/">http://www.w3.org/TR/mmi-use-cases/</a></dd>
<dt>Previous version:</dt>
<dd><i>this is the first publication</i></dd>
<dt>Editors:</dt>
<dd>Emily Candell, Dave Raggett</dd>
</dl>
<p class="copyright"><a
href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612#Copyright">
Copyright</a> © 2002 <a href="http://www.w3.org/"><abbr
title="World Wide Web Consortium">W3C</abbr></a> <sup>®</sup> (
<a href="http://www.lcs.mit.edu/"><abbr
title="Massachusetts Institute of Technology">MIT</abbr></a>, <a
href="http://www.inria.fr/"><abbr lang="fr"
title="Institut National de Recherche en Informatique et Automatique">
INRIA</abbr></a>, <a href="http://www.keio.ac.jp/">Keio</a> ), All
Rights Reserved. W3C <a
href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612#Legal_Disclaimer">
liability</a>, <a
href="http://www.w3.org/Consortium/Legal/ipr-notice-20000612#W3C_Trademarks">
trademark</a>, <a
href="http://www.w3.org/Consortium/Legal/copyright-documents-19990405">
document use</a>, and <a
href="http://www.w3.org/Consortium/Legal/copyright-software-19980720">
software licensing</a> rules apply.</p>
</div>
<hr title="Separator from Header" />
<h2 class="notoc" id="abstract">Abstract</h2>
<p>The W3C <a href="http://www.w3.org/2002/mmi/">Multimodal
Interaction Activity</a> is developing specifications as a basis
for a new breed of Web applications in which you can interact using
multiple modes of interaction, for instance, using speech,
handwriting, and key presses for input, and spoken prompts, audio and
visual displays for output. This document describes several use
cases for multimodal interaction and presents them in terms of
varying device capabilities and the events needed by each use case
to couple different components of a multimodal application.</p>
<h2 id="Status">Status of this Document</h2>
<p><em>This section describes the status of this document at the
time of its publication. Other documents may supersede this
document. The latest status of this document series is maintained
at the <abbr
title="the World Wide Web Consortium">W3C</abbr>.</em></p>
<p>W3C's <a href="http://www.w3.org/2002/mmi/">Multimodal
Interaction Activity</a> is developing specifications for extending
the Web to support multiple modes of interaction. This document
describes several use cases as the basis for gaining a better
understanding of the requirements for multimodal interaction, and
the kinds of information flows needed for multimodal
applications.</p>
<p>This document has been produced as part of the <a
href="http://www.w3.org/2002/mmi/">W3C Multimodal Interaction
Activity</a>,<span class="c1"><a
href="http://www.w3.org/2002/mmi/Activity.html"></a></span>
following the procedures set out for the <a
href="http://www.w3.org/Consortium/Process/">W3C Process</a>. The
authors of this document are members of the <a
href="http://www.w3.org/2002/mmi/Group/">Multimodal Interaction
Working Group</a> (<a
href="http://cgi.w3.org/MemberAccess/AccessRequest">W3C Members
only</a>). This is a Royalty Free Working Group, as described in
W3C's <a href="/TR/2002/NOTE-patent-practice-20020124">Current
Patent Practice</a> NOTE. Working Group participants are required
to provide <a href="http://www.w3.org/2002/01/mmi-ipr.html">patent
disclosures</a>.</p>
<p>Please send comments about this document to the public mailing
list: <a
href="mailto:www-multimodal@w3.org">www-multimodal@w3.org</a> (<a
href="http://lists.w3.org/Archives/Public/www-multimodal/">public
archives</a>). To subscribe, send an email to <<a
href="mailto:www-multimodal-request@w3.org">www-multimodal-request@w3.org</a>>
with the word <em>subscribe</em> in the subject line
(include the word <em>unsubscribe</em> if you want to
unsubscribe).</p>
<p>A list of current W3C Recommendations and other technical
documents including Working Drafts and Notes can be found at <a
href="http://www.w3.org/TR/">http://www.w3.org/TR/</a>.</p>
<h2 id="intro">1. Introduction</h2>
<p>Analysis of use cases provides insight into the requirements for
applications likely to require a multimodal infrastructure.</p>
<p>The use cases described below were selected for analysis in
order to highlight different requirements resulting from
application variations in areas such as device requirements, event
handling, network dependencies, and methods of user interaction.</p>
<p>It should be noted that although the results of this analysis will
be used as input to the Multimodal Specification being developed by
the W3C Multimodal Interaction Working Group, there is no guarantee
that all of these applications will be implementable using the
language defined in the specification.</p>
<h3 id="devices">1.1 Use Case Device Classification</h3>
<h4 id="thin">Thin Client</h4>
<p>A device with little processing power and capabilities that can
be used to capture user input (microphone, touch display, stylus,
etc) as well as non-user input such as GPS. The device may have a
very limited capability to interpret the input, for example a
small-vocabulary speech recognizer or a character recognizer. The bulk
of the processing occurs on the server including natural language
processing and dialog management.</p>
<p>An example of such a device may be a mobile phone with DSR
(Distributed Speech Recognition) capabilities and a visual browser (there could actually be thinner
clients than this).</p>
<h4 id="thick">Thick Client</h4>
<p>A device with powerful processing capabilities, such that most
of the processing can occur locally. Such a device is capable of
input capture and interpretation. For example, the device can have
a medium vocabulary speech recognizer, a handwriting recognizer,
natural language processing and dialog management capabilities. The
data itself may still be stored on the server.</p>
<p>An example of such a device may be a recent production PDA or an
in-car system.</p>
<h4 id="medium">Medium Client</h4>
<p>A device capable of input capture and some degree of
interpretation. The processing is distributed in a client/server or
a multidevice architecture. For example, a medium client will have
the voice recognition capabilities to handle small-vocabulary command
and control tasks but connects to a voice server for more advanced
dialog tasks.</p>
<h3 id="summaries">1.2 Use Case Summaries</h3>
<h4 id="table1">Table 1: <a href="#form-filling">Form Filling for
air travel reservation</a></h4>
<table border="1" cellpadding="5" summary="4 column table">
<tbody>
<tr>
<th>Description</th>
<th>Device Classification</th>
<th>Device Details</th>
<th>Execution Model</th>
</tr>
<tr>
<td>The means for a user to reserve a flight using a wireless
personal mobile device and a combination of input and output
modalities. The dialogue between the user and the application is
directed through the use of a form-filling paradigm.</td>
<td>Thin and medium clients</td>
<td>touch-enabled display (i.e., supports pen input), voice input,
local ASR and Distributed Speech Recognition Framework, local
handwriting recognition, voice output, TTS, GPS, wireless
connectivity, roaming between various networks.</td>
<td>Client Side Execution</td>
</tr>
</tbody>
</table>
<h5 id="form-filling-details">Scenario Details</h5>
<p>User wants to make a flight reservation with his mobile device
while he is on the way to work. The user initiates the service
either by making a phone call to a multimodal service (telephone
metaphor) or by selecting an application (portal environment
metaphor). The details are not described here.</p>
<p>As the user moves between networks with very different
characteristics, the user is offered the flexibility to interact
using the preferred and most appropriate modes for the situation.
For example, while sitting in a train, the use of stylus and
handwriting can achieve higher accuracy than speech (due to
surrounding noise) and protect privacy. When the user is walking,
the more appropriate input and output modalities would be
voice with some visual output. Finally, at the office the user can
use pen and voice in a synergistic way.</p>
<p>The dialogue between the user and the application is driven by a
form-filling paradigm where the user provides input to fields such
as "Travel Origin:", "Travel Destination:", "Leaving on date",
"Returning on date". As the user selects each field in the
application to enter information, the corresponding input
constraints are activated to drive the recognition and
interpretation of the user input. The capability of providing
composite multimodal input is also examined, where input from
multiple modalities is combined for the interpretation of the
user's intent.</p>
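<p>To make this exchange concrete, the following sketch outlines the
device-side handling in Python. It is illustrative only: the event
names (on_focus, listen_ack) are taken from the event table in
section 2.1, while the helper objects (server, recognizer) and the
grammar format are assumptions, not part of any specification.</p>
<pre>
# Illustrative device-side sketch: selecting a form field activates
# the corresponding recognition constraints (helpers are assumed).

class FormField:
    def __init__(self, name):
        self.name = name
        self.grammar = None  # constraints arrive from the server

def focus_field(field, server):
    # Tell the server which field now has focus.
    server.send({"event": "on_focus", "field_name": field.name})
    # The server answers with listen_ack carrying the field grammar.
    ack = server.receive()
    if ack["event"] == "listen_ack":
        field.grammar = ack["field_grammar"]

def recognize_input(field, ink_strokes, recognizer):
    # Local handwriting recognition, constrained by the field grammar,
    # returning the top-n hypotheses with confidence scores.
    return recognizer.recognize(ink_strokes, grammar=field.grammar)
</pre>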
<h4 id="table2">Table 2: <a href="#driving-dir">Driving
Directions</a></h4>
<table border="1" cellpadding="5" summary="4 column table">
<tbody>
<tr>
<th>Description</th>
<th>Device Classification</th>
<th>Device Details</th>
<th>Execution Model</th>
</tr>
<tr>
<td>This application provides a mechanism for a user to request and
receive driving directions via speech and graphical input and
output</td>
<td>Medium Client</td>
<td>on-board system (in a car) with a graphical display, map
database, touch screen, voice and touch input, speech output, local
ASR and TTS Processing and GPS.</td>
<td>Client Side Execution</td>
</tr>
</tbody>
</table>
<h5 id="driving-direction-details">Scenario Details</h5>
<p>User wants to go to a specific address from his current location
and while driving wants to take a detour to a local restaurant (The
user does not know the restaurant's address or name). The user
initiates service via a button on his steering wheel and interacts
with the system via the touch screen and speech.</p>
<h4 id="table3">Table 3: <a href="#name-dialing">Name
Dialing</a></h4>
<table border="1" cellpadding="5" summary="4 column table">
<tbody>
<tr>
<th>Description</th>
<th>Device Classification</th>
<th>Device Details</th>
<th>Execution Model</th>
</tr>
<tr>
<td>
<p>The means for users to call someone by saying their name.</p>
</td>
<td>
<p>Thin and thick clients</p>
</td>
<td>
<p>Telephone</p>
</td>
<td>
<p>The study covers several possibilities:</p>
<ul>
<li>whether the application runs in the device or the server</li>
<li>whether the device supports limited local speech
recognition</li>
</ul>
<p>These choices determine the kinds of events that are needed to
coordinate the device and network based services.</p>
</td>
</tr>
</tbody>
</table>
<h5 id="name-dialing-details">Scenario Details</h5>
<p>Janet presses a button on her multimodal phone and says one of
the following commands:</p>
<ul>
<li>Call Wendy</li>
<li>Call Wendy on her cell phone</li>
<li>Call Wendy at work</li>
<li>Call Wendy Smith at Acme Research</li>
</ul>
<p>The application initially looks for a match in Janet's personal
contact list and if no match is found then proceeds to look in
other directories. Directed dialog and tapered help are used to
narrow down the search, using aural and visual prompts. Janet is
able to respond by pressing buttons, or tapping with a stylus, or
by using her voice.</p>
<p>Once a selection has been made, rules defined by Wendy are used
to determine how the call should be handled. Janet may see a
picture of Wendy along with a personalized message (aural and
visual) that Wendy has left for her. Call handling may depend on
the time of day, the location and status of both parties, and
the relationship between them. An "ex" might be told to never call
again, while Janet might be told that Wendy will be free in half an
hour after Wendy's meeting has finished. The call may be
automatically directed to Wendy's home, office or mobile phone, or
Janet may be invited to leave a message.</p>
<h2 id="use-case-details">2. Use Case Details</h2>
<h3 id="form-filling">2.1 Use-case: Form filling for air travel
reservation</h3>
<p>Description: The air travel reservation use case describes a
scenario in which the user books a flight using a wireless personal
mobile device and a combination of input and output modalities.</p>
<p>The device has a touch-enabled display (i.e., supports pen
input) and it is voice enabled. The use case describes a rich
multimodal interaction model that allows the user to start a
session while commuting on the train, continue the interaction
while walking to his office, and complete the transaction while
seated at his office desk. As the user moves between environments with
very different characteristics, the user is given the opportunity
to interact using the preferred and most appropriate modes for the
situation. For example, while sitting in a train, the use of stylus
and handwriting can offer higher accuracy than speech (due to
noise) and protect privacy. When the user is walking, the more
appropriate input and output modalities would be voice with some visual
output. Finally, at the office the user can use pen and voice in a
synergistic way.</p>
<p>This example assumes the seamless transition through a variety
of connectivity options such as high bandwidth LAN at the office
(e.g., 802.11), lower bandwidth while walking (e.g., a cellular
network such as GPRS), and low-bandwidth, intermittent
connectivity while on the train (e.g., the connection can be
dropped when going through a tunnel). The scenario also takes
advantage of network services such as location and time.</p>
<h4 id="form-filling-actors">Actors</h4>
<ul>
<li>User who makes the air travel reservation</li>
<li>Mobile device with touch-enabled display, wireless network
connectivity, handwriting recognition capability and limited voice
recognition capability on the device.</li>
<li>Network service with full voice dialog capabilities, connection
to travel reservation database and location/time services.</li>
</ul>
<h4 id="form-filling-assumptions">Additional Assumptions</h4>
<ul>
<li>Data capabilities are available on the communications
provider's network. Voice requirements are satisfied either via
voice capabilities on the communications provider network or
through a DSR framework that utilizes the existing data
capabilities.</li>
<li>There are means for describing user and device profile
information and means of exchanging this information between server
and client.</li>
</ul>
<h4 id="table4">Table 4: Event Table</h4>
<table border="1" cellpadding="5" summary="5 column table">
<tr>
<th>User Action</th>
<th>Action on device</th>
<th>Events sent from device</th>
<th>Action on server</th>
<th>Events sent From server</th>
</tr>
<tr>
<td>
<p>Device turned on</p>
</td>
<td>
<p>Registers with network and uploads delivery context [available
I/O modalities, bandwidth, user-specific info (e.g., home
city)]</p>
</td>
<td>
<p>register_device (delivery_context)</p>
</td>
<td>
<p>Complete session initiation by registering device and delivery
context (init_session)</p>
</td>
<td>
<p>register_ack</p>
</td>
</tr>
<tr>
<td>
<p>User picks travel app (taps with stylus or says travel)</p>
</td>
<td>
<p>Client side of application is started</p>
</td>
<td>
<p>app_connect (app_name)</p>
</td>
<td>
<p>Loads a page that is appropriate to current profile</p>
</td>
<td>
<p>app_connect_ack (start_page)</p>
</td>
</tr>
<tr>
<td colspan="5">
<p>Application is running and ready to take input. Origin city was
guessed from the user profile or a location service. User is on the train.
Active I/O modalities are pen, display and audio output.</p>
</td>
</tr>
<tr>
<td>
<p>User picks a field in the form to interact with, using the stylus</p>
</td>
<td>
<p>Destination field gets highlighted</p>
</td>
<td>
<p>on_focus (field_name)</p>
</td>
<td>
<p>Server loads the appropriate constraints for input on this
field. Constraints are sent to the device for handwriting recognition.</p>
</td>
<td>
<p>listen_ack (field_grammar)</p>
</td>
</tr>
<tr>
<td>
<p>User writes in the field and lifts the stylus when finished</p>
</td>
<td>
<p>Handwriting recognition performed locally with visual and audio
presentation of result (i.e., earcon)</p>
</td>
<td>
<p> </p>
</td>
<td>
<p> </p>
</td>
<td>
<p> </p>
</td>
</tr>
<tr>
<td colspan="5">
<p>If recognition confidence is low, a different earcon is played
and pop-up menu of top-n hypotheses is displayed.</p>
</td>
</tr>
<tr>
<td>
<p>User approves result by moving to next field with stylus (e.g.,
departure time)</p>
</td>
<td>
<p>Result is submitted to server.</p>
<p> </p>
<p>Time field is highlighted.</p>
</td>
<td>
<p>submit_partial (destination)</p>
<p>on_focus (field_name)</p>
</td>
<td>
<p>Dialog state is updated. Appropriate constraints for input on
this field are loaded. Grammar constraints are sent to the
device</p>
</td>
<td>
<p>listen_ack (field_grammar)</p>
</td>
</tr>
<tr>
<td colspan="5">
<p>User gets off the train and starts walking - I/O modality is
voice only</p>
</td>
</tr>
<tr>
<td>
<p>User explicitly switches profile via button press, or the
profile is changed automatically based on non-user sensory input</p>
</td>
<td>
<p>Profile update - only voice enabled input with voice and visual
output</p>
</td>
<td>
<p>update (delivery_context)</p>
</td>
<td>
<p>Speech recognition and output module initialization.
Synchronization of dialog state between modalities. Audio prompt
"what time do you want to leave" is generated).</p>
</td>
<td>
<p>send (audio_prompt)</p>
</td>
</tr>
<tr>
<td>
<p>In response to audio prompt, user says "I want a flight in the
morning".</p>
</td>
<td>
<p>Audio is collected and sent it to server through data or voice
channel</p>
</td>
<td>
<p>send (audio)</p>
</td>
<td>
<p>Recognizes voice and generates list of hypotheses. Corresponding
audio prompt is created (e.g., "would you like to fly at 10 or
11 in the morning").</p>
</td>
<td>
<p>send (audio_prompt)</p>
</td>
</tr>
<tr>
<td colspan="5">
<p>While walking, field selection is either driven by the dialog
engine on the server, or by the user uttering simple phrases (e.g.,
voice graffiti)</p>
</td>
</tr>
<tr>
<td>
<p>User reaches his office.</p>
</td>
<td>
<p>Profile is switched explicitly via button press, or changed
automatically based on non-user sensory input.</p>
</td>
<td>
<p>Events and handlers as previously described for changing the delivery
context to accommodate interaction via voice, pen and GUI
selection</p>
</td>
<td> </td>
<td> </td>
</tr>
<tr>
<td colspan="5">
<p>At this point in the dialogue, it has been determined that there
are no direct flights between origin and destination. The
application displays available routes with in-between stops on a
map and the user is prompted to select one.</p>
</td>
</tr>
<tr>
<td>
<p>User says "I would like to take this one" while making a pen
gesture (i.e., circling over the preferred route)</p>
</td>
<td>
<p>Ink and audio are collected and sent to the server with time
stamp information.</p>
</td>
<td>
<p>send (audio)</p>
<p>send (ink)</p>
</td>
<td>
<p>Server receives the two inputs and integrates them into a
semantic representation</p>
<p>Server updates app with selection, acknowledging that input
integration was possible.</p>
</td>
<td>
<p>completeAck</p>
</td>
</tr>
<tr>
<td colspan="5">
<p>At this point in the dialog, payment authorization needs to be
made. User enters credit card information via voice, pen or
keypad.</p>
</td>
</tr>
<tr>
<td>
<p>User provides signature for authorization purposes</p>
</td>
<td>
<p>Ink is collected with information about pressure and tilt.</p>
</td>
<td>
<p>send (ink)</p>
</td>
<td>
<p>Server verifies signature.</p>
</td>
<td>
<p>DONE</p>
</td>
</tr>
</table>
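<p>The composite input rows above (speech plus a circling pen
gesture) can be illustrated with a short sketch. The Python code
below is an assumption-laden illustration rather than part of the use
case: it supposes that send (audio) and send (ink) events carry start
and end timestamps, and that the server fuses the two inputs when
their time intervals fall within a small integration window.</p>
<pre>
# Illustrative server-side sketch: fusing time-stamped speech and ink
# into one semantic frame ("take this one" + circled route).

INTEGRATION_WINDOW = 2.0  # seconds between the two inputs; assumed

def close_in_time(speech, ink, window=INTEGRATION_WINDOW):
    # Both inputs carry "start" and "end" timestamps set by the device.
    gap = max(speech["start"], ink["start"]) - min(speech["end"], ink["end"])
    return not gap > window

def integrate(speech, ink, routes):
    # Deictic speech ("this one") resolves against the pen gesture.
    if close_in_time(speech, ink) and "this one" in speech["text"]:
        route = hit_test(ink["strokes"], routes)  # assumed geometry helper
        return {"intent": "select_route", "route": route}
    return None  # inputs could not be combined; fall back to a reprompt
</pre>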
<h3 id="driving-dir">2.2 Use-case: Driving Directions</h3>
<h4 id="driving-dir-assumptions">Assumptions</h4>
<ul>
<li>ASR services are local for simple requests (e.g. session
preference setup)</li>
<li>ASR is server-based for complex requests (e.g. addresses)</li>
<li>TTS local</li>
<li>Execution model is hosted on the device.</li>
<li>single language - with acknowledgement that we will ultimately
need language selection</li>
<li>availability (always on) - with acknowledgement that there may
be temporary interruptions due to unexpected circumstances (e.g.
tunnels, mountains)</li>
<li>driver is alone [cannot get assistance]</li>
<li>Additional applications may be available when the service is
initiated via a service selection menu (this is beyond the scope of
this use case analysis)</li>
<li>Initiating recognition requires a single button press. Button
press indicating end of speech is optional, assuming a
preconfigured timeout to stop listening (requiring the user to hold
down a button while driving may be dangerous)</li>
<li>At any time during the session, the user may change display
options via the touch screen (includes zooming in and changing
route display options). Display options may also be changed using
speech by initiating a dialog by pressing the button on the
steering wheel</li>
</ul>
<h4 id="driving-dir-actors">Actors</h4>
<p>Primary Device:</p>
<ul>
<li>
<p>on-board system (in a car) with the following capabilities:</p>
<ul>
<li>graphical display:
<ul>
<li>maps</li>
<li>Estimated time of arrival</li>
<li>Textual Directions</li>
</ul>
</li>
<li>touch screen</li>
<li>voice (input and output)</li>
<li>keyboard/text input</li>
<li>local ASR and TTS processing</li>
<li>access to remote servers (ASR and App Server)</li>
<li>GPS</li>
</ul>
</li>
</ul>
<p>Data sources:</p>
<ul>
<li>route database</li>
<li>traffic conditions</li>
<li>GPS data</li>
<li>speedometer</li>
<li>landmarks database and places of interest:
<ul>
<li>nearest gas station</li>
<li>nearest restaurant of a specific type</li>
</ul>
</li>
<li>User Preference Database</li>
</ul>
<h4 id="driving-dir-walkthru">Scenario Walkthrough (User point of
view)</h4>
User preferences (these may be changed on a per-session basis):
<ul>
<li>Primary Input: Speech</li>
<li>Secondary Input: Touch Screen</li>
<li>Speech and Graphical Output</li>
<li>Preferences are stored on the server to enable multiple users
to use the same device (Preferences may be retrieved automatically
based on speaker identification or key identification eliminating
the need for an authentication dialog)</li>
</ul>
<p>User wants to go to a specific address from his current location
and while driving wants to take a detour to a local restaurant (The
user does not know the restaurant's address or name).</p>
<h4 id="table5">Table 5: Event Table</h4>
<table border="1" cellpadding="5" summary="5 column table">
<tr>
<th>User Action/External Input</th>
<th>Action on Device</th>
<th>Event Description</th>
<th>Event Handler</th>
<th>Resulting Action</th>
</tr>
<tr>
<td>User presses button on steering wheel</td>
<td>Service is initiated and GPS satellite detection begins</td>
<td>HTTP Request to app server</td>
<td>App server returns initial page to device</td>
<td>Welcome prompts are played. Authentication dialog is initiated
(may be initiated via speaker identification or key
identification).</td>
</tr>
<tr>
<td>User interacts in an authentication dialog</td>
<td>Device executes authentication dialog using local ASR
processing</td>
<td>HTTP Request to app server which includes user credentials</td>
<td>App server returns initial page to device including user
preferences</td>
<td>User is prompted for a destination (if additional services are
available after authentication, assume that the user selects the driving
direction application)</td>
</tr>
<tr>
<td>Initial GPS Input</td>
<td>N/A</td>
<td>GPS_Data_In Event</td>
<td>Device handles location information</td>
<td>Device updates map on graphical display (assumes all maps are
stored locally on device)</td>
</tr>
<tr>
<td>User selects option to change volume of on-board unit using
touch display.</td>
<td>N/A</td>
<td>Touch_screen_event (includes x, y coordinates)</td>
<td>Touch screen detects and processes input</td>
<td>Volume indicator changes on screen. Volume of speech output is
changed</td>
</tr>
<tr>
<td>User presses button on steering wheel</td>
<td>Device initiates connection to ASR server</td>
<td>Start_Listening Event</td>
<td>ASR Server receives request and establishes connection</td>
<td>"listening" icon appears on display (utterances prior to
establishing the connection are buffered)</td>
</tr>
<tr>
<td>User says destination address (may improve recognition accuracy
by sending grammar constraints to server based on a local dialog
with the user instead of allowing any address from the start)</td>
<td>N/A</td>
<td>N/A</td>
<td>ASR Server processes speech and returns results to device</td>
<td>Device processes results and plays confirmation dialog to user
while highlighting destination and route on graphical display</td>
</tr>
<tr>
<td>User confirms destination</td>
<td>Device performs ASR Processing locally. Upon confirmation,
destination info is sent to app server</td>
<td>HTTP Request is sent to app server (includes current location
and destination information)</td>
<td>App Server processes input and returns data to device</td>
<td>Device processes results and updates graphical display with
route and directions highlighting next step</td>
</tr>
<tr>
<td>GPS Input at regular intervals</td>
<td>N/A</td>
<td>GPS_Data_In Event</td>
<td>Device processes location data and checks if location milestone
is hit</td>
<td>Device updates map on graphical display (assumes all maps are
stored locally on device) and highlights current step. When
milestone is hit, next instruction is played to user</td>
</tr>
<tr>
<td>GPS Input at regular intervals (indicating driver is off
course)</td>
<td>N/A</td>
<td>GPS_Data_In Event</td>
<td>Device processes location data and determines that user is off
course</td>
<td>Map on graphical display is updated and textual message is
displayed indicating that route is not correct. Prompt is played
from the device indicating that route is being recalculated</td>
</tr>
<tr>
<td>N/A</td>
<td>Route request is sent to app server including new location
data</td>
<td>HTTP Request is sent to app server (includes current location
and destination information)</td>
<td>App Server processes input and returns data to device</td>
<td>Device processes results and updates graphical display with
route and directions highlighting next step</td>
</tr>
<tr>
<td>Alert received on device based on traffic conditions</td>
<td>N/A</td>
<td>Route_Change Alert</td>
<td>Device processes event and initiates dialog to determine if
route should be recalculated</td>
<td>User is informed of traffic conditions and asked whether route
should be recalculated.</td>
</tr>
<tr>
<td>User requests recalculation of route based on current traffic
conditions</td>
<td>Device performs ASR Processing locally. Upon confirmation,
destination info is sent to app server</td>
<td>HTTP Request is sent to app server (includes current location
and destination information)</td>
<td>App Server processes input and returns data to device</td>
<td>Device processes results and updates graphical display with
route and directions highlighting next step</td>
</tr>
<tr>
<td>GPS Input at regular intervals</td>
<td>N/A</td>
<td>GPS_Data_In Event</td>
<td>Device processes location data and checks if location milestone
is hit</td>
<td>Device updates map on graphical display (assumes all maps are
stored locally on device) and highlights current step. When
milestone is hit, next instruction is played to user</td>
</tr>
<tr>
<td>User presses button on steering wheel</td>
<td>Connection to ASR server is established</td>
<td>Start_Listening Event</td>
<td>ASR Server receives request and establishes connection</td>
<td>User hears acknowledgement prompt for continuation, and
"listening" icon appears on display</td>
</tr>
<tr>
<td>User requests new destination by destination type while still
depressing button on steering wheel (may improve recognition
accuracy by sending grammar constraints to server based on a local
dialog with the user)</td>
<td>N/A</td>
<td>N/A</td>
<td>ASR Server processes speech and returns results to device</td>
<td>Device processes results and plays confirmation dialog to user
while highlighting destination and route on graphical display</td>
</tr>
<tr>
<td>User confirms destination via a multiple interaction dialog to
determine exact destination</td>
<td>Device executes dialog based on user responses (using local ASR
Processing) and accesses app server as needed</td>
<td>HTTP requests to app server for dialog and data specific to
user response</td>
<td>App server responds with appropriate dialog</td>
<td>User interacts in a dialog and selects destination. User is
asked whether this is a new destination</td>
</tr>
<tr>
<td>User indicates that this is a stop on the way to original
destination</td>
<td>Devices sends updated destination information to app
server</td>
<td>HTTP Request for updated directions (based on current location,
detour destination, and ultimate destination)</td>
<td>App Server processes input and returns data to device</td>
<td>Device processes results and updates graphical display with new
route and directions highlighting next step</td>
</tr>
<tr>
<td>GPS Input at regular intervals</td>
<td>N/A</td>
<td>GPS_Data_In Event</td>
<td>Device processes location data and checks if location milestone
is hit</td>
<td>Device updates map on graphical display (assumes all maps are
stored locally on device) and highlights current step. When
milestone is hit, next instruction is played to user</td>
</tr>
</table>
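<p>To make the GPS-driven rows in the table concrete, the following
is a minimal Python sketch of how a device might handle GPS_Data_In
events. The milestone radius, the off-course limit, and the helper
names (distance, request_new_route) are all assumptions made for
illustration.</p>
<pre>
# Illustrative device-side handler for GPS_Data_In events.

MILESTONE_RADIUS = 50.0   # meters; an assumed threshold
OFF_COURSE_LIMIT = 150.0  # meters from the planned route; assumed

def on_gps_data_in(position, route, tts, display):
    display.update_map(position)  # maps are stored locally on device
    step = route.current_step()
    if not distance(position, step.milestone) > MILESTONE_RADIUS:
        # Milestone hit: play the next instruction and advance.
        tts.play(route.next_instruction())
        route.advance()
    elif distance(position, route.nearest_point(position)) > OFF_COURSE_LIMIT:
        # Driver is off course: inform the user and ask the app server
        # for a recalculated route (the HTTP request in the table).
        tts.play("Recalculating route")
        request_new_route(position, route.destination)
</pre>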
<h4 id="protocols">Protocols:</h4>
<ul>
<li>HTTP</li>
<li>Proprietary protocol for connection to ASR server?</li>
<li>GPS</li>
<li>Others</li>
</ul>
<h4 id="driving-dir-events">Events:</h4>
<ul>
<li>ASR Events</li>
<li>Touch Screen Events</li>
<li>GPS Updates</li>
<li>Refresh Triggers</li>
<li>Traffic Alerts</li>
<li>Others???</li>
</ul>
<h4 id="driving-dir-synch">Synchronization Issues:</h4>
<ul>
<li>Spoken Directions must be synchronized with current
location</li>
<li>When route changes while prompts are playing, current prompts
must be stopped and new prompts queued (see the sketch after this list). This may be triggered by
the following:
<ul>
<li>Button on steering wheel (BSW) pressed by user</li>
<li>Screen is touched</li>
<li>Traffic Update event is received</li>
<li>Driver Error</li>
</ul>
</li>
<li>Screen must be updated to reflect current location and route.
This may be triggered by:
<ul>
<li>Refresh Event</li>
<li>Change of destination</li>
<li>Change of route</li>
<li>Driver Error</li>
</ul>
</li>
<li>Asynchronous events such as traffic updates need to be
synchronized with explicit user requests including:
<ul>
<li>Route change requests</li>
<li>Display/Output Preference change requests</li>
</ul>
</li>
<li>Others???</li>
</ul>
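<p>The prompt-stopping requirement above can be sketched as a small
prompt queue that is flushed whenever one of the listed triggers
fires. The Python sketch below assumes a tts object with play and
stop operations; it is an illustration, not part of the use case
itself.</p>
<pre>
# Illustrative prompt queue: route-change triggers (BSW press, touch,
# traffic update, driver error) flush stale directions before new
# prompts are queued.

from collections import deque

class PromptQueue:
    def __init__(self, tts):
        self.tts = tts          # assumed TTS engine with play/stop
        self.queue = deque()

    def enqueue(self, prompt):
        self.queue.append(prompt)

    def on_route_change(self, new_prompts):
        # Stop whatever is playing and drop the now-stale prompts.
        self.tts.stop()
        self.queue.clear()
        self.queue.extend(new_prompts)

    def tick(self):
        # Called whenever the TTS engine becomes idle.
        if self.queue:
            self.tts.play(self.queue.popleft())
</pre>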
<h4 id="driving-dir-latency">Latency Concerns</h4>
<ul>
<li>Unanticipated app Server delays may cause directions to be
inaccurate</li>
</ul>
<h4 id="driving-dir-considerations">Scenario Considerations</h4>
<p>Input Information:</p>
<ul>
<li>Starting address/location:
<ul>
<li>explicit street address</li>
<li>current location obtained via GPS</li>
<li>landmark or place of interest</li>
</ul>
</li>
<li>Ending address/location:
<ul>
<li>explicit street address</li>
<li>landmark or place of interest</li>
</ul>
</li>
<li>Traffic Conditions</li>
<li>General preferences:
<ul>
<li>highway vs. scenic route</li>
<li>time vs. distance</li>
<li>style of output (graphical, turn-by-turn, etc...)</li>
<li>units of output (miles vs. kilometers)</li>
</ul>
</li>
</ul>
<p>Possible Devices:</p>
<ul>
<li>Phone with display</li>
<li>Phone without display (voice only)</li>
<li>In-dash system (GPS, ASR, TTS)</li>
<li>PC</li>
<li>PDA</li>
<li>Phone (voice + data)</li>
<li>UMTS</li>
</ul>
<p>Available Technologies:</p>
<ul>
<li>Communication (2.5G, 3G)</li>
<li>Display (Y/N)</li>
<li>Application run-time environment (BREW, J2ME, etc)</li>
<li>Server access</li>
</ul>
<p>Data sources:</p>
<ul>
<li>route database</li>
<li>traffic conditions</li>
<li>location [GPS]</li>
<li>speed and time of arrival [GPS, speedometer]</li>
<li>landmarks database and places of interest:
<ul>
<li>nearest gas station</li>
<li>nearest restaurant of a specific type</li>
</ul>
</li>
<li>User Preference Database</li>
</ul>
<p>Output Mechanisms:</p>
<ul>
<li>graphical (map)</li>
<li>text description</li>
<li>voice</li>
<li>fax</li>
<li>dynamic updates (recalculation based on traffic information,
driver error, etc...)</li>
<li>single delivery of results vs. multiple/sequential delivery of
results as needed</li>
</ul>
<h3 id="name-dialing">2.3 Use Case: Multimodal Name Dialling Use
Case</h3>
<h4 id="name-dialing-overview">Overview</h4>
<p>The Name Dialing use case describes a scenario in which users
can say a name into their mobile terminals and be connected to the
named person based on the called party's availability for that
caller.</p>
<p>If the called user is not available, the calling user may be
given the choice of either leaving a message on the called user's
voicemail system or sending an email to the called user. The called
user may provide a personalized message for the caller, including,
for example, "Don't ever call me again!"</p>
<p>The called user is given the opportunity of selecting which
device the call should be routed to, e.g. work, mobile, home, or
voice mail. This may be dependent on the time of day, the called
user's location, and the identity of the calling user.</p>
<p>The use case assumes a rich model of name dialing as an example
of a premium service exploiting a range of information such as
personal and network directories, location, presence, buddy lists
and personalization features.</p>
<p>The benefits of making this a multimodal interaction include the
ability to view and listen to information about the called user,
and to be able to use a keypad or stylus, as an alternative to
using voice as part of the name selection process.</p>
<h4 id="name-dialing-actors">Actors</h4>
<ul>
<li>
<p>Caller — user who wishes to place a call</p>
</li>
<li>
<p>Called user — user who wishes control over how incoming
calls are handled</p>
</li>
<li>
<p>Mobile display phone with a lightweight client browser, and
optional speaker-dependent minimal speech recognition
capabilities</p>
</li>
<li>
<p>Network based directory service with speech recognition
capabilities, this provides support for looking up names in
personal contact lists, as well as in corporate and public
directories</p>
</li>
<li>
<p>Network based unified messaging service with provision for
composing, transferring and playing back messages, including
personalized messages intended for specific callers</p>
</li>
<li>
<p>User profile database with presence information, buddy lists,
and personalized call handling rules</p>
</li>
</ul>
<h4 id="name-dialing-assumptions">Assumptions</h4>
<p>The user has a device with a button that is pushed to place a
call. The device has recording capabilities. [voice activation is
power hungry and unreliable in noisy environments]</p>
<p>Both voice and data capabilities are available on the
communications provider's network (not necessarily as
simultaneously active modes).</p>
<p>If the phone supports speech recognition and there is a local
copy of the personal phone contact list, then the user's spoken
input is first recognized against the local directory for a
possible match and if unsuccessful, the request is extended back to
the directory provider.</p>
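<p>A minimal sketch of this two-stage lookup is given below in
Python. The confidence threshold and the shapes of the local
recognizer and directory-provider interfaces are assumptions for
illustration; the use case does not prescribe them.</p>
<pre>
# Illustrative two-stage lookup: try the local personal directory
# first, then extend to the network-based directory provider.

CONFIDENCE_THRESHOLD = 0.6  # assumed; below this, treat as no match

def resolve_name(utterance, local_reco, personal_directory, provider):
    if local_reco is not None:
        result = local_reco.recognize(utterance, grammar=personal_directory)
        if result is not None and result.confidence > CONFIDENCE_THRESHOLD:
            return result.matches  # one or more personal-list entries
    # No usable local match: extend the recorded utterance to the
    # network-based directory provider for recognition.
    return provider.recognize(utterance)
</pre>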
<p>The directory provider has access to a messaging service and to
user profiles and presence information. The directory provider thus
knows the whereabouts of each registered user - on the phone, at
work, unavailable etc.</p>
<p>The directory provider enforces access control rules to ensure
individual and corporate privacy. This isn't explored in this use
case.</p>
<p>People can be identified by personal names like "Wendy" or by
nick names or aliases. The personal contact list provides a means
for subscribers to define their own aliases, and to limit the scope
of search (there are a lot of Wendys worldwide).</p>
<p>There is a user agent on the client device with an XHTML browser
and optional speaker-dependent speech recognition capabilities.</p>
<p>There is a client server relationship between the user agent on
the device and the directory provider.</p>
<p>The dialog could be driven from either the client device or from
the network. This doesn't affect the user view, but does alter the
events used to coordinate the two systems. This will be explored in
a later section.</p>
<p>The Name Dialing use case will be described through the
following views:</p>
<h4 id="name-dialing-user-view">User view</h4>
<p>User pushes a button and says</p>
<pre>
"Call Wendy Smith"
</pre>
<p>It is also possible to say such things as:</p>
<pre>
"Call Wendy"
"Call Wendy Smith at work".
"Call Wendy at home".
"Call Wendy Smith on her mobile phone".
</pre>
<p>Multiple scenarios are possible here:</p>
<p>If local recognition is supported, the utterance will first be
processed by a local name dialing application. If there is no
match, the recorded utterance is forwarded to a network-based name
dialing application.</p>
<p>The user's personal contact list will take priority over the
corporate and public directories. This is independent of whether
the personal list is held locally in the device or in the
network.</p>
<p>The following situations can arise when the user says a
name:</p>
<ol>
<li>
<p>Single match — the caller is presented with information
about the callee. This may include a picture taken from the
callee's profile. The caller is asked for a confirmation before the
call is put through.</p>
</li>
<li>
<p>Multiple matches — if the number of matches is small
(perhaps five or fewer), the caller is asked to choose from the
list. This is presented to the caller via speech and accompanied
with a display of a list of names and pictures. The caller can
then:</p>
<ul>
<li>Use a button on the phone to select a list item.</li>
<li>Point or touch a link on the screen in the presented list.</li>
<li>Say index number or expanded name from the presented list.</li>
</ul>
<p>A further alternative is to say "that one" as the system speaks
each item in the list in sequence. This method is offered in case
the user needs hands- and eyes-free operation, or the device is
incapable of displaying the list.</p>
</li>
<li>
<p>Lots of matches, for example, when the caller says a common
name. The caller is led through a directed dialog to narrow down
the search.</p>
</li>
<li>
<p>No recognition — the recognizer wasn't able to find a
match. The user could have failed to say anything, or there could
have been too much noise. A tapered help mechanism is invoked.
Callers could be asked to repeat themselves, or asked to key in the
number or speak it digit by digit.</p>
</li>
</ol>
<p>Assuming that the user successfully makes a selection:</p>
<ul>
<li>
<p>The system retrieves further information on the called user such
as the current location and local time of that user. The
information presented may depend on the relationship between the
called and calling users. This assumes support for a buddy list and
presence capability. The called user may specify her availability
for specific individuals or groups of would be callers depending on
time of day etc.</p>
</li>
<li>
<p>Two scenarios are described here:</p>
<ol>
<li>
<p>The system finds that the called person is currently available.
A picture and/or sound bite is provided to the caller. The system
places the call and the user is connected to Wendy Smith.</p>
<p><b>Post condition</b>: The user is in a call with the intended
party.</p>
</li>
<li>
<p>The system finds that the called person is unavailable. The
system attempts to connect to the called user's voicemail
system.</p>
<p>Assuming this succeeds, the system plays the following prompt
back to the caller: "Wendy Smith is currently unavailable. She has
left this message for you."</p>
<p>The message is played out. It could be a multimedia message with
recorded sound, text, pictures and even short video clips.</p>
<p>The system plays a prompt back - "Would you like to leave a
message?"</p>
<p>The user says "Yes".</p>
<p>The user is then connected to the voicemail system and leaves a
message for Wendy Smith.</p>
<p>If Wendy's voicemail box is full or unavailable, the system
offers the caller the chance of composing an email. This occupies
the caller's storage allocation until it has been sent.</p>
<p><b>Post condition</b>: The user has left a message for the
intended party.</p>
</li>
</ol>
</li>
</ul>
<p>The availability of the called user may depend on the time of
day, whether the called user is away from her work or home
location, and who the calling user is. For example, when travelling
you may want to take calls on your mobile during the day. Don't you
hate it when people call you in the middle of the night because
they don't realize what timezone you are in! You may want to make
an exception for close friends and family members. There may also
be some people whom you never want to accept calls from, not even
voice messages!</p>
<p>When a user is notified of an incoming call, the device may
present information on the caller including a photograph, name,
sound bite, location and local time information, depending on the
relationship between the caller and callee. The user then has an
opportunity to accept the call or to divert it to voice mail.</p>
<h4 id="name-dialing-provider-view">Directory provider View</h4>
<ul>
<li>
<p>The client on the user device records the spoken input. The
spoken input is recognized against the directory on client device.
When this fails, the utterance is extended to the directory
provider for recognition.</p>
<p>If the user device doesn't support local recognition, it may
still need to record the utterance, so that the user can start
talking immediately without needing to wait for the connection to
the directory provider to be completed.</p>
</li>
<li>
<p>The directory provider retrieves the profile for the calling
user. This has information on which device the user is calling
from, the current location of the calling user etc. The calling
user is authenticated and authorized.</p>
</li>
<li>
<p>The recognizer in the provider recognizes the spoken utterance
and returns the result. This result can either be a single entry or
a list of possible close matches.</p>
</li>
<li>
<p>The server application (hosting the directory provider) now
controls the flow of the interaction henceforth.</p>
</li>
<li>
<p>The server goes to the database and retrieves more information
based on the recognizer result.</p>
</li>
<li>
<p>The provider queries the presence of the called user, and
personalization information (buddy list, location and presence
information, etc.) to construct the content for the response.</p>
</li>
<li>
<p>A result may be returned back to the client device in more than
one way here:</p>
<p>A single XHTML page is constructed with both visual picture and
audio with the complete name of the recognized match.</p>
<p>The feedback can include two channels such as visual for the
picture and a separate voice channel for playing back the name of
the user (an optimization for reduced latency).</p>
</li>
<li>
<p>The server creates and transfers a composed page to the client
device.</p>
</li>
<li>
<p>Once the client receives the content from the application
server, multiple scenarios are possible here based on the
recognizer result. See user view for details.</p>
</li>
<li>
<p>Picking a choice from a list can be done by voice, button or
stylus. The user should be able to browse the list, and to revisit
the list upon rejecting a confirmation of a preceding choice.</p>
<p>Example: user says "Call the first one". This utterance is
processed by the directory provider to select the first match.</p>
</li>
<li>
<p>The directory application may need to apply a directed dialog to
narrow the search when there are more than a few matches, or when
recognition and tapered help needs to be offered.</p>
</li>
</ul>
<h4 id="name-dialing-initiative">What is driving the dialog?</h4>
<p>The details of the events depend on whether the dialog is being
driven from the network or from the user device.</p>
<p>When the device sends a spoken utterance to the server, the user
may have spoken a name such as "Tom Smith" or spoken a command such
as "the last one". If the directory search is being driven by the
user device, the server's response is likely to be a short list of
matches, or a command or error code. To support the application,
the server would provide a suite of functions, including the means
for the device to set the recognition context, the ability to play
specific prompts, and to download information on named users.</p>
<p>If the network is driving the dialog, the device sends the
spoken utterance in the same way, but the responses are actions to
update the display and local state. If the caller presses a button
or uses a stylus to make a selection, this event will be sent to
the server. The device and server could exchange low level events,
such as a stylus tap at a given coordinate, or higher level events
such as which name the user has selected from the list.</p>
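<p>The contrast can be illustrated with a few assumed event payloads;
neither format is defined by this use case, and the field names are
invented for the example.</p>
<pre>
# Illustrative event payloads contrasting the two designs (Python).

# Device-driven dialog: the server acts as a recognition and directory
# service and answers with data; the device decides what happens next.
device_request = {"event": "recognize", "audio": "utterance-42",
                  "context": "personal_directory"}
device_response = {"event": "reco_ok",
                   "matches": ["Wendy Smith (work)",
                               "Wendy Smith (mobile)"]}

# Network-driven dialog: the device forwards raw or lightly abstracted
# input events; the server answers with display and state updates.
low_level_event = {"event": "stylus_tap", "x": 112, "y": 48}
high_level_event = {"event": "list_selection", "index": 1}
server_response = {"event": "update_display",
                   "page": "confirm_callee.xhtml"}
</pre>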
<h4 id="table6">Table 6: Event Table</h4>
<table border="1" cellpadding="5" summary="5 column table">
<tr valign="top">
<th width="10%">
<p>User action</p>
</th>
<th width="25%">
<p>Action on device</p>
</th>
<th width="20%">
<p>Events sent from device</p>
</th>
<th width="25%">
<p>Action on server</p>
</th>
<th width="20%">
<p>Events sent from server</p>
</th>
</tr>
<tr valign="top">
<td>
<p>Turns on the device</p>
</td>
<td>
<p>Registers with the Directory Provider through the operator in
the network and downloads the personal directory</p>
</td>
<td>
<p>register user (userId)</p>
</td>
<td>
<p>Directory Provider gets register information, updates user's
presence and location info, loads user's personal info (buddy list,
personal directory,...)</p>
</td>
<td>
<p>acknowledgement + personal directory</p>
<p class="comment">In practice, SyncML would be used to reduce net
traffic</p>
</td>
</tr>
<tr valign="top">
<td>
<p>Pushes a button to place a call</p>
</td>
<td>
<p>Initializes local recognition and activates the personal
directory</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Displays a prompt</p>
<p>"Please say a name"</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Speaks a name</p>
</td>
<td>
<p>Local recognition against personal directory</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>a) If the personal grammar matches:</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Displays the name or name list (see Table 7)</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Confirms by pressing the call button again if one name is
displayed, or selects a name on the list (see Table 7)</p>
</td>
<td>
<p>Fetches the number from the personal directory</p>
</td>
<td>
<p>call(userID, number)</p>
</td>
<td>
<p>Checks the location and presence status of the called party</p>
</td>
<td>
<p>call ok(picture)<br />
OR<br />
called party not available</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>if call ok, displays the picture and places a call,</p>
<p>if called party not available, displays/plays a corresponding
prompt about leaving a message or sending an e-mail</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>i) if user chooses to leave a message:</p>
</td>
</tr>
<tr valign="top">
<td>
<p>User agrees to leave a message by pressing a suitable button</p>
</td>
<td>
<p>Initializes the recording, displays a prompt to start the
recording</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>User speaks and ends by pressing a suitable button</p>
</td>
<td>
<p>Closes the recording, sends the recording to the Directory
Provider app</p>
</td>
<td>
<p>leave message(userID, number, recording)</p>
</td>
<td>
<p>Stores the message for the called party</p>
</td>
<td>
<p>message ok</p>
</td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>ii) if user chooses to send an e-mail:</p>
</td>
</tr>
<tr valign="top">
<td>
<p>User selects 'send e-mail' option by pressing a suitable
button</p>
</td>
<td>
<p>Starts an e-mail writing application</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Writes e-mail</p>
</td>
<td>
<p>Fetches the e-mail address from the personal directory, sends
e-mail, closes the e-mail app</p>
</td>
<td>
<p>send mail(userID, mail address, text)</p>
</td>
<td>
<p>Sends the e-mail to the called party</p>
</td>
<td>
<p>mail ok</p>
</td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>b) If the personal grammar does not match:</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Sends the utterance to be recognized in the network</p>
</td>
<td>
<p>send(userID, utterance)</p>
</td>
<td>
<p>Recognition against public directory</p>
</td>
<td>
<p>reco ok(namelist)</p>
<p>OR</p>
<p>reco nok</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>if reco ok, displays the name or name list (more details in
Table 7) and activates local reco with the index list if there is
more than one name,</p>
<p>if reco nok, displays/plays a message to the user</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Confirms by pressing the call button again if 1 name is
displayed, or selects a name on the list (see following table)</p>
</td>
<td>
<p>Receives the selection (a spoken index may be recognized locally
first)</p>
</td>
<td>
<p>call(userID, number)</p>
</td>
<td>
<p>Checks the location ... [continues as described above]</p>
</td>
<td> </td>
</tr>
</table>
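<p>The branch structure of Table 6 (try the personal grammar locally,
and fall back to network recognition against the public directory
when it does not match) can be summarized in a short sketch. The
helper stubs and the confidence threshold are assumptions for
illustration, not part of the event table.</p>
<pre>
# Hedged sketch of the device-side flow in Table 6. The stubs and the
# confidence threshold are assumptions for illustration.

from dataclasses import dataclass
from typing import List, Optional

LOCAL_CONFIDENCE_THRESHOLD = 0.7  # assumed value

@dataclass
class RecoResult:
    confidence: float
    namelist: List[str]

def local_reco(utterance: bytes) -> Optional[RecoResult]:
    """Stub for on-device recognition against the personal grammar."""
    return None  # pretend the personal grammar did not match

def server_send(user_id: str, utterance: bytes) -> Optional[RecoResult]:
    """Stub for the send(userID, utterance) event of Table 6."""
    return RecoResult(0.9, ["Tom Smith", "Tom Smythe"])  # reco ok

def handle_spoken_name(user_id: str, utterance: bytes) -> Optional[List[str]]:
    # a) Personal grammar matches: display the local result.
    result = local_reco(utterance)
    if result is not None and result.confidence >= LOCAL_CONFIDENCE_THRESHOLD:
        return result.namelist
    # b) Personal grammar does not match: recognize in the network.
    result = server_send(user_id, utterance)
    return result.namelist if result is not None else None  # None = reco nok
</pre>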
<h4 id="table7">Table 7: Interaction details of displaying and
confirming the recognition results</h4>
<table border="1" cellpadding="7" summary="5 column table">
<tr valign="top">
<th width="10%">
<p>User action</p>
</th>
<th width="25%">
<p>Action on device</p>
</th>
<th width="20%">
<p>Events sent from device</p>
</th>
<th width="25%">
<p>Action on server</p>
</th>
<th width="20%">
<p>Events sent from server</p>
</th>
</tr>
<tr>
<td colspan="5" valign="top">
<p>... the speaker's utterance has been processed by the
recognizer</p>
</td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>A. Very high confidence, unique match, automatic confirmation
(Note: letting the user confirm explicitly is recommended even
here; it would also make the application behavior seem more
consistent to the user, since some kind of confirmation would then
be needed every time)</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Displays the name and shows/plays a clear prompt: "Calling ..."</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Fetches the number</p>
</td>
<td>
<p>call(userID, number)</p>
</td>
<td>
<p>Checks the location and presence status of the called party</p>
</td>
<td>
<p>call ok(picture)</p>
<p>OR</p>
<p>called party not available</p>
</td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>B. High confidence, unique match, explicit confirmation</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Displays the name and picture, prompt asking "Place a call?"</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Confirms by pressing the call button again</p>
</td>
<td>
<p>Fetches the number</p>
</td>
<td>
<p>call(userID, number)</p>
</td>
<td>
<p>Checks the location and presence status of the called party</p>
</td>
<td>
<p>call ok(picture)</p>
<p>OR</p>
<p>called party not available</p>
</td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>C. High confidence with several matching entries, or medium
confidence with either a unique match or several matching
entries</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Displays the name list with indexes and activates the index
grammar on the local recognizer; if multiple entries share the same
spelling, additional distinguishing information should be added to
the list</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Selects a name by speaking the index or navigating to the
correct name with keypad and pressing the call button</p>
</td>
<td>
<p>Fetches the number</p>
</td>
<td>
<p>call(userID, number)</p>
</td>
<td>
<p>Checks the location and presence status of the called party</p>
</td>
<td>
<p>call ok(picture)</p>
<p>OR</p>
<p>called party not available</p>
</td>
</tr>
<tr>
<td colspan="5" valign="top">
<p>D. Low confidence, no match from the directory or directories</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Prompts "Not found, please try again"</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>User speaks the name again</p>
</td>
<td>
<p>New recognition; on the 2<sup>nd</sup> or 3<sup>rd</sup>
'nomatch', change the prompt to something like "Sorry, no number
found"</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</table>
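<p>The four confidence tiers of Table 7 map naturally onto a small
dispatch function, sketched below. The threshold values and names are
assumptions for illustration; real recognizers would define their own
confidence scales.</p>
<pre>
# Hedged sketch of the confidence tiers A-D in Table 7.
# Threshold values are assumed, not specified by the use case.

VERY_HIGH, HIGH, MEDIUM = 0.95, 0.85, 0.60  # assumed thresholds

def dispatch_reco_result(confidence, matches, attempt=1):
    """Map a recognition result to the next interaction step."""
    if confidence >= VERY_HIGH and len(matches) == 1:
        return ("auto_confirm", matches[0])           # case A
    if confidence >= HIGH and len(matches) == 1:
        return ("explicit_confirm", matches[0])       # case B
    if (confidence >= HIGH and len(matches) > 1) or \
       (confidence >= MEDIUM and matches):
        return ("show_indexed_list", matches)         # case C
    if attempt >= 2:                                  # case D, repeated
        return ("prompt", "Sorry, no number found")
    return ("prompt", "Not found, please try again")  # case D, first try
</pre>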
<h4 id="table8">Table 8: No local recognition, all recognition in
the Network</h4>
<table border="1" cellpadding="7" summary="">
<tr valign="top">
<th width="10%">
<p>User action</p>
</th>
<th width="25%">
<p>Action on device</p>
</th>
<th width="20%">
<p>Events sent from device</p>
</th>
<th width="25%">
<p>Action on server</p>
</th>
<th width="20%">
<p>Events sent from server</p>
</th>
</tr>
<tr valign="top">
<td>
<p>Turns on the device</p>
</td>
<td>
<p>Registers with the Directory Provider through the operator in
the network</p>
</td>
<td>
<p>register user(userID)</p>
</td>
<td>
<p>Directory Provider gets register information, updates user's
presence and location info, loads user's personal info (buddy list,
personal directory,...)</p>
</td>
<td>
<p>register ack</p>
</td>
</tr>
<tr valign="top">
<td>
<p>Pushes a button to place a call</p>
</td>
<td> </td>
<td>
<p>init reco(userID)</p>
</td>
<td>
<p>Activates the personal directory and public directory</p>
</td>
<td>
<p>reco init ok</p>
</td>
</tr>
<tr valign="top">
<td> </td>
<td>
<p>Displays a prompt</p>
<p>"Please say a name"</p>
</td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr valign="top">
<td>
<p>Speaks a name</p>
</td>
<td>
<p>Sends the utterance to be recognized in the network</p>
</td>
<td>
<p>send(userID, utterance)</p>
</td>
<td>
<p>Recognition against the personal directory first; if no match
there has confidence greater than some threshold, then recognition
against the public directory</p>
</td>
<td>
<p>reco ok(namelist)</p>
<p>OR</p>
<p>reco nok</p>
</td>
</tr>
</table>
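<p>The server-side cascade of Table 8 (personal directory first, then
the public directory if no match clears the threshold) is sketched
below. The recognizer callback and the threshold value are
assumptions for illustration.</p>
<pre>
# Hedged sketch of the network-only recognition cascade in Table 8.
# recognize(utterance, directory) stands in for the server's
# recognizer and is assumed to return (confidence, namelist).

CONFIDENCE_THRESHOLD = 0.7  # "some threshold" in Table 8; value assumed

def network_reco(utterance, recognize):
    confidence, namelist = recognize(utterance, directory="personal")
    if confidence > CONFIDENCE_THRESHOLD and namelist:
        return ("reco ok", namelist)
    confidence, namelist = recognize(utterance, directory="public")
    if confidence > CONFIDENCE_THRESHOLD and namelist:
        return ("reco ok", namelist)
    return ("reco nok", [])
</pre>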
<h2 id="acknowledgements">3. Acknowledgements</h2>
<p>The following people contributed to this document:</p>
<ul>
<li>Paulo Baggia, Loquendo</li>
<li>Art Barstow, Nokia</li>
<li>Emily Candell, Comverse</li>
<li>Debbie Dahl, Consultant and Working Group Chair</li>
<li>Stephen Potter, Microsoft</li>
<li>Vlad Sejnoha, Scansoft</li>
<li>Luc Van Tichelin, Scansoft</li>
<li>Tasos Anastasakos, Motorola</li>
<li>Lin Chen, Voice Genie</li>
<li>Jim Larson, Intel Architecture Lab</li>
<li>T.V. Raman, IBM</li>
<li>Derek Schwenke, Mitsubishi Electric</li>
<li>Giovanni Seni, Motorola</li>
<li>Dave Raggett, W3C/Openwave</li>
<li>Bennett Marks, Nokia</li>
<li>Katriina Halonen, Nokia</li>
<li>Ramalingam Hariharan, Nokia</li>
<li>Stephane Maes, IBM</li>
<li>Purush Yeluripati</li>
<li>Kuansan Wang, Microsoft</li>
</ul>
</body>
</html>