hash_uris.html 50.8 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549
<?xml version="1.0"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <style type="text/css" media="all">
    @import "/QA/2006/01/blogstyle.css";
    </style>
    <meta name="keywords" content='http, uri' />
    <meta name="description" content="There's been quite a bit of discussion recently about the use of hash-bang URIs following their adoption by Gawker, and the ensuing downtime of that site. The TAG at the W3C have also been drafting a document on Repurposing the Hash Sign for the New Web which takes a rather wider view than just the hash-bang issue, and on which they are seeking comments.

All matters of design involve weighing different choices against some criteria that you decide on implicitly or explicitly: there is no single right way of doing things on the web. Here, I explore the choices that are available to web developers around hash URIs and discuss how to mitigate the negative aspects of adopting the hash-bang pattern.
" />
    <meta name="revision" content="$Id: hash_uris.html,v 1.22 2011/12/16 03:00:16 gerald Exp $" />    
   <link rel="alternate" type="application/atom+xml" title="Atom" href="http://www.w3.org/QA/atom.xml" />
   <link rel="alternate" type="application/rss+xml" title="RSS 1.0" href="http://www.w3.org/QA/news.rss" />   
   <title>Hash URIs - W3C Blog</title>

   <link rel="start" href="http://www.w3.org/QA/" title="Home" />
   <link rel="prev" href="http://www.w3.org/QA/2011/05/rdf_interfaces_published.html" title="RDF Interfaces published…" />
   <link rel="next" href="http://www.w3.org/QA/2011/05/web-tracking-test-cases.html" title="Web Tracking and User Privacy Workshop: Test Cases for Privacy on the Web " />

   <!--
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:trackback="http://madskills.com/public/xml/rss/module/trackback/"
         xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description
    rdf:about="http://www.w3.org/QA/2011/05/hash_uris.html"
    trackback:ping="http://www.w3.org/QA/sununga/mt-tb.cgi/392"
    dc:title="Hash URIs"
    dc:identifier="http://www.w3.org/QA/2011/05/hash_uris.html"
    dc:subject="Web Architecture"
    dc:description="There&apos;s been quite a bit of discussion recently about the use of hash-bang URIs following their adoption by Gawker, and the ensuing downtime of that site. The TAG at the W3C have also been drafting a document on Repurposing the Hash Sign for the New Web which takes a rather wider view than just the hash-bang issue, and on which they are seeking comments.

All matters of design involve weighing different choices against some criteria that you decide on implicitly or explicitly: there is no single right way of doing things on the web. Here, I explore the choices that are available to web developers around hash URIs and discuss how to mitigate the negative aspects of adopting the hash-bang pattern.
"
    dc:creator="Jeni Tennison"
    dc:date="2011-05-12T18:17:18+00:00" />
</rdf:RDF>
-->

    <!-- <script type="text/javascript" src="http://www.w3.org/QA/mt.js"></script>-->

</head>
<body class="layout-one-column">
      <div id="banner">
      <h1 id="title">
	<a href="http://www.w3.org/"><img height="48" alt="W3C" id="logo" src="http://www.w3.org/Icons/WWW/w3c_home_nb" /></a>
W3C Blog
</h1>
    </div>
    
    <ul class="navbar" id="menu">
        <li><strong><a href="/QA/" title="W3C Blog Home">[ W3C Blog ]</a></strong></li>
        <li><a href="/QA/Library/" title="Documents and Publications on Web and Quality">Documents</a></li>
        <li><a href="/QA/Tools/" accesskey="3" title="Validators and other Tools">Tools</a></li>
        <li><a href="/2007/12/qa-blog-help/index#feedback">Feedback</a></li>
    </ul>
<div id="searchbox">
<form method="get" action="http://www.google.com/custom" enctype="application/x-www-form-urlencoded">
<p id="formbox"><input type="text" size="15" class="textfield" name="q" accesskey="E" maxlength="255" /> <input type="submit" class="submitfield" value="Search" id="goButton" name="sa" accesskey="G" /> <input type="hidden" name="cof" value="T:black;LW:72;ALC:#ff3300;L:http://www.w3.org/Icons/w3c_home;LC:#000099;LH:48;BGC:white;AH:left;VLC:#660066;GL:0;AWFID:0b9847e42caf283e;" /><input type="hidden" id="searchW3C" name="sitesearch" checked="checked" value="www.w3.org/QA" /><input type="hidden" name="domains" value="www.w3.org/QA" /></p>
</form>
</div>


    <div id="main"><!-- This DIV encapsulates everything in this page - necessary for the positioning -->

                     <p class="content-nav">
                        <a href="http://www.w3.org/QA/2011/05/rdf_interfaces_published.html">&laquo; RDF Interfaces published…</a> |
                        <a href="http://www.w3.org/QA/">Main</a>
                        | <a href="http://www.w3.org/QA/2011/05/web-tracking-test-cases.html">Web Tracking and User Privacy Workshop: Test Cases for Privacy on the Web  &raquo;</a>
                     </p>

                        <h2 class="entry-header">Hash URIs</h2>
                           <div class="entry-body">
                              <p><em>Note: This was initially posted at <a href="http://www.jenitennison.com/blog/node/154">http://www.jenitennison.com/blog/node/154</a>.</em></p>

<p>There&#8217;s been quite a bit of discussion recently about the use of <a href="http://code.google.com/web/ajaxcrawling/docs/getting-started.html">hash-bang URIs</a> following their <a href="http://blogs.wsj.com/digits/2011/02/07/gawker-outage-causing-twitter-stir/">adoption by Gawker, and the ensuing downtime of that site</a>.</p>

<p>Gawker have redesigned their sites, including <a href="http://lifehacker.com/">lifehacker</a> and various others, such that all URIs look like <code>http://{domain}#!{path-to-content}</code> &#8212; the <code>#!</code> is the hash-bang. The home page on the domain serves up a static HTML page that pulls in Javascript that interprets the <code>path-to-content</code> and requests that content through AJAX, which it then slots into the page. The sites all suffered an outage when, for whatever reason, the Javascript couldn&#8217;t load: without working Javascript you couldn&#8217;t actually view any of the content on the site.</p>

<p>This provoked a massive cry of #FAIL (or perhaps that should be #!FAIL) and a lot of puns along the lines of making a hash of a website and it going bang. For analysis and opinions on both sides, see:</p>

<ul>
<li><a href="http://isolani.co.uk/blog/javascript/BreakingTheWebWithHashBangs">Breaking the Web with hash-bangs by Mike Davies</a></li>
<li><a href="http://www.tbray.org/ongoing/When/201x/2011/02/09/Hash-Blecch">Broken Links by Tim Bray</a></li>
<li><a href="http://blog.benward.me/post/3231388630">Hash, Bang, Wallop by Ben Ward</a></li>
<li><a href="http://blog.tomgibara.com/post/3214368343/hash-bang-boom">Hash-bang boom by Tom Gibara</a></li>
<li><a href="http://www.adequatelygood.com/2011/2/Thoughts-on-the-Hashbang">Thoughts on the Hashbang by Ben Cherry</a></li>
<li><a href="http://lists.w3.org/Archives/Public/www-tag/2011Feb/0095.html">Nathan&#8217;s comments on www-tag</a></li>
</ul>

<p>While all this has been going on, the <a href="http://www.w3.org/2001/tag/">TAG at the W3C</a> have been drafting a document on <a href="http://www.w3.org/2001/tag/2011/02/HashInURI-20110228.html">Repurposing the Hash Sign for the New Web</a> (originally named <a href="http://www.w3.org/TR/2009/WD-hash-in-uri-20090415/">Usage Patterns For Client-Side URI parameters</a> in April 2009) which takes a rather wider view than just the hash-bang issue, and on which they are seeking comments.</p>

<p>All matters of design involve weighing different choices against some criteria that you decide on implicitly or explicitly: there is no single right way of doing things on the web. Here, I explore the choices that are available to web developers around hash URIs and discuss how to mitigate the negative aspects of adopting the hash-bang pattern.</p>

<h2>Background</h2>

<p>The semantics of hash URIs have changed over time. Look back at <a href="http://tools.ietf.org/html/rfc1738">RFC 1738: Uniform Resource Locators (URL)</a> from December 1994 and fragments are hardly mentioned; when they are, they are termed &#8220;fragment/anchor identifiers&#8221;, reflecting their original use which was to jump to an anchor within an HTML page (indicated by an <code>&lt;a&gt;</code> element with a <code>name</code> attribute; those were the days).</p>

<p>Skip to <a href="http://tools.ietf.org/html/rfc2396">RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax</a> from August 1998 and <a href="http://tools.ietf.org/html/rfc2396#section-4.1">fragment identifiers</a> have their own section, where it says:</p>

<blockquote>
  <p>When a URI reference is used to perform a retrieval action on the identified resource, the optional fragment identifier, separated from the URI by a crosshatch (&#8220;#&#8221;) character, consists of additional reference information to be interpreted by the user agent after the retrieval action has been successfully completed.  As such, it is not part of a URI, but is often used in conjunction with a URI.</p>
</blockquote>

<p>At this point, the fragment identifier:</p>

<ul>
<li>is not part of the URI</li>
<li>should be interpreted in different ways based on the mime type of the representation you get when you retrieve the URI</li>
<li>is only meaningful when the URI is actually retrieved and you know the mime type of the representation</li>
</ul>

<p>Forward to <a href="http://tools.ietf.org/html/rfc3986">RFC 3986: Uniform Resource Identifier (URI): Generic Syntax</a> from January 2005 and fragment identifiers are defined as part of the URI itself:</p>

<blockquote>
  <p>The fragment identifier component of a URI allows indirect identification of a secondary resource by reference to a primary resource and additional identifying information. The identified secondary resource may be some portion or subset of the primary resource, some view on representations of the primary resource, or some other resource defined or described by those representations.</p>
</blockquote>

<p>This breaks away from the tight coupling between a fragment identifier and a representation retrieved from the web and purposefully allows the use of hash URIs to define abstract or real-world things, addressing <a href="http://www.w3.org/2001/tag/issues.html#abstractComponentRefs-37">TAG Issue 37: Definition of abstract components with namespace names and frag ids</a> and supporting the use of <a href="http://www.w3.org/TR/cooluris/#hashuri">hash URIs in the semantic web</a>.</p>

<p>Around the same time, we have the growth of AJAX, where a <a href="http://itsnat.sourceforge.net/php/spim/spi_manifesto_en.php">single page interface</a> is used to access a wide set of content which is dynamically retrieved using Javascript. The AJAX experience could be frustrating for end users, because the back button no longer worked (to let them go back to previous states of their interface) and they couldn&#8217;t bookmark or share state. And so applications started to <a href="http://www.contentwithstyle.co.uk/content/fixing-the-back-button-and-enabling-bookmarking-for-ajax-apps">use hash URIs to track AJAX state</a> (that article is from June 2005, if you&#8217;re following the timeline).</p>

<p>And so we get to hash-bangs. These were <a href="http://googlewebmastercentral.blogspot.com/2009/10/proposal-for-making-ajax-crawlable.html">proposed by Google</a> in October 2009 as a mechanism to distinguish between cases where hash URIs are being used as anchor identifiers, to describe views, or to identify real-world things, and those cases where they are being used to capture important AJAX state. What Google proposed is for <strong>pages where the content of the page is determined by a fragment identifier and some Javascript</strong> to <em>also</em> be accessible by combining the base URI with a query parameter (<code>_escaped_fragment_={fragment}</code>). To distinguish this use of hash URIs from the more mundane kinds, Google proposed starting the fragment identifier with <code>!</code>, giving <code>#!</code> (hash-bang). Hash-bang URIs are therefore associated with the practice of <a href="http://en.wikipedia.org/wiki/Transclusion">transcluding</a> content into a wrapper page.</p>

<p>To summarise, hash URIs are now being used in three distinct ways:</p>

<ol>
<li>to identify parts of a retrieved document</li>
<li>to identify an abstract or real-world thing (that the document says something about)</li>
<li>to capture the state of client-side web applications</li>
</ol>

<p>Hash-bang URIs are a particular form of the third of these. By using them, the website indicates that the page uses client-side transclusion to give the true content of the page. If it follows Google&#8217;s proposal, the website also commits to making that content available through an equivalent base URI with a <code>_escaped_fragment_</code> parameter.</p>

<h2>Hash-bang URIs in practice</h2>

<p>Let&#8217;s have a look at how hash-bang URIs are used in a couple of sites.</p>

<h3>Lifehacker</h3>

<p>First, we&#8217;ll look at lifehacker, which is one of Gawker&#8217;s sites whose switch to hash-bangs triggered the recent spate of comments. What happens if I link to the article <a href="http://lifehacker.com/#!5770791/top-10-tips-and-tricks-for-making-your-work-life-better"><code>http://lifehacker.com/#!5770791/top-10-tips-and-tricks-for-making-your-work-life-better</code></a>?</p>

<p>The exact response to this request seems to depend on some cookies (it didn&#8217;t work the first time I accessed it in Firefox, having pasted the link from another browser). If it works as expected, in a browser that supports Javascript, the browser gets the page at the base URI <a href="http://lifehacker.com/"><code>http://lifehacker.com/</code></a>, which includes (amongst a <em>lot</em> of other things) a script that <code>POST</code>s to <a href="http://lifehacker.com/index.php?_actn_=ajax_post"><code>http://lifehacker.com/index.php?_actn_=ajax_post</code></a> a request with the data:</p>

<pre><code>op=ajax_post
refId=5770791
formToken=d26bd943151005152e6e0991764e6c09
</code></pre>

<p>The response to this <code>POST</code> is a 53kB JSON document that contains a bit of metadata about the post and then its escaped HTML content. This gets inserted into the page by the script, to display the post. As this isn&#8217;t a <code>GET</code>table resource, I&#8217;ve <a href="/blog/files/lifehacker.json">attached this file</a> to this post so you can see what it looks like.</p>

<p>(Honestly, I could hardly bring myself to describe this: a <code>POST</code> to get some data? a <code>.php</code> URL? query parameter set to <code>ajax_post</code>? massive amounts of escaped HTML in a JSON response? Geesh. Anyway, focus&#8230; hash-bang URIs&#8230;)</p>

<p>A browser that doesn&#8217;t support Javascript simply gets the base URI and is none the wiser about the actual content that was linked to.</p>

<p>What about the <code>_escaped_fragment_</code> equivalent URI, <a href="http://lifehacker.com/?_escaped_fragment_=5770791/top-10-tips-and-tricks-for-making-your-work-life-better"><code>http://lifehacker.com/?_escaped_fragment_=5770791/top-10-tips-and-tricks-for-making-your-work-life-better</code></a>? If you request this, you get back a <code>200 OK</code> response which is an HTML page with the content embedded in it. It looks just the same as the original page with the embedded content.</p>

<p>What if you make up some rubbish URI, which in normal circumstances you would expect to give a <code>404 Not Found</code> response? Naturally, a request to the base URI of <code>http://lifehacker.com/</code> is always going to give a <code>200 OK</code> response, although if you try <a href="http://lifehacker.com/#!1234/made-up-page"><code>http://lifehacker.com/#!1234/made-up-page</code></a> you get page furniture with no content in the page. A request to <a href="http://lifehacker.com/?_escaped_fragment_=1234/made-up-page"><code>http://lifehacker.com/?_escaped_fragment_=1234/made-up-page</code></a> results in a <code>301 Moved Permanently</code> to the hash-bang URI <a href="http://lifehacker.com/#!1234"><code>http://lifehacker.com/#!1234</code></a> rather than the <code>404 Not Found</code> that we&#8217;d want.</p>

<h3>Twitter</h3>

<p>Now let&#8217;s look at Twitter. What happens if I link to the tweet <a href="http://twitter.com/#!/JeniT/status/35634274132561921"><code>http://twitter.com/#!/JeniT/status/35634274132561921</code></a>? Although it&#8217;s not indicated in the <code>Vary</code> header, Twitter determines what to do about any requests to this hashless URI based on whether I&#8217;m logged in or not (based on a cookie).</p>

<p>If I am logged on, I get the new home page. This home page <code>GET</code>s (through various iframes and Javascript obfuscation) several small JSON files through Twitter&#8217;s API: </p>

<ul>
<li><a href="http://api.twitter.com/1/statuses/show.json?include_entities=true&amp;contributor_details=true&amp;id=35634274132561921"><code>http://api.twitter.com/1/statuses/show.json?include_entities=true&amp;contributor_details=true&amp;id=35634274132561921</code></a>: the details of the tweet</li>
<li><a href="http://api.twitter.com/1/statuses/35634274132561921/retweeted_by.json?count=15"><code>http://api.twitter.com/1/statuses/35634274132561921/retweeted_by.json?count=15</code></a>: details about retweets</li>
<li><a href="http://api.twitter.com/1/users/lookup.json?user_id=&amp;screen_name=unhosted"><code>http://api.twitter.com/1/users/lookup.json?user_id=&amp;screen_name=unhosted</code></a>: details about the twitter user <a href="http://twitter.com/unhosted">@unhosted</a>, who was mentioned in the tweet</li>
</ul>

<p>This JSON gets converted into HTML and embedded within the page using Javascript. All the links within the page are to hash-bang URIs and there is no way of identifying the hashless URI (unless you know the very simple pattern that you can simply remove the <code>#!/</code> from the URI to get a static page).</p>

<p>If I&#8217;m not logged on but am using a browser that understands Javascript, the browser GETs <code>http://twitter.com/</code>; the script in the returned page picks out the fragment identifier and redirects (using Javascript) to <a href="http://twitter.com/JeniT/status/35634274132561921"><code>http://twitter.com/JeniT/status/35634274132561921</code></a>.</p>

<p>If, on the other hand, I&#8217;m using curl or a browser without Javascript activated, I just get the home page and have no idea that the original hash-bang URI was supposed to give me anything different.</p>

<p>The response to the hashless URI <a href="http://twitter.com/JeniT/status/35634274132561921"><code>http://twitter.com/JeniT/status/35634274132561921</code></a> also varies based on whether I&#8217;m logged in or not. If I am, the response is a <code>302 Found</code> to the hash-bang URI <a href="http://twitter.com/#!/JeniT/status/35634274132561921"><code>http://twitter.com/#!/JeniT/status/35634274132561921</code></a>. If I&#8217;m not, for example using curl, Twitter just returns a normal HTML page that contains information about the tweet that I&#8217;ve just requested.</p>

<p>Finally, if I request the <code>_escaped_fragment_</code> version of the hash-bang URI <a href="http://twitter.com/?_escaped_fragment_=/JeniT/status/35634274132561921"><code>http://twitter.com/?_escaped_fragment_=/JeniT/status/35634274132561921</code></a> the result is a <code>301 Moved Permanently</code> redirection to the hashless URI <a href="http://twitter.com/JeniT/status/35634274132561921"><code>http://twitter.com/JeniT/status/35634274132561921</code></a> which can be retrieved as above.</p>

<p>Requesting a status that doesn&#8217;t exist such as <a href="http://twitter.com/#!/JeniT/status/1"><code>http://twitter.com/#!/JeniT/status/1</code></a> in the browser results in a page that at least tells you the content doesn&#8217;t exist. Requesting the equivalent <code>_escaped_fragment_</code> URI redirects to the hashless URI <a href="http://twitter.com/JeniT/status/1"><code>http://twitter.com/JeniT/status/1</code></a>. Requesting this results in a <code>404 Not Found</code> response, as you would expect.</p>

<h2>Advantages of Hash URIs</h2>

<p>Why are these sites using hash-bang URIs? Well, hash URIs in general have four features which make them useful to client-side applications: they provide addresses for application states; they give caching (and therefore performance) boosts; they enable web applications to draw data from separate servers; and they may have SEO benefits.</p>

<h3>Addressing</h3>

<p>Interacting with the web is all about moving from one state to another, through clicking on links, submitting forms, and otherwise taking action on a page.</p>

<p>Backend databases on web servers, cookies, and other forms of <a href="http://www.w3.org/TR/webstorage/">local storage</a> provide methods of capturing application state, but on the web we&#8217;ve found that having <strong>addresses</strong> for states is essential for a whole bunch of things that we find useful:</p>

<ul>
<li>being able to use the <strong>back button</strong> to return to previous states</li>
<li>being able to <strong>bookmark</strong> states that we want to return to in the future</li>
<li>being able to <strong>share</strong> states with other people by linking to them</li>
</ul>

<p>On the web, the only addressing method that meets these goals is the URI. Addresses that involve more than a URI, such as &#8220;search <code>http://example.com/</code> with the keyword X and click on the third link&#8221; or &#8220;access <code>http://example.org/</code> with cookie X set to Y&#8221; or &#8220;access <code>http://example.net</code> with the HTTP header X set to Y&#8221; simply don&#8217;t work. You can&#8217;t bookmark them or link to them or put them on the side of a bus.</p>

<p>Application state is complex and multi-faceted. As a web developer, you have to work out which parts of the application state need to be addressable through URIs, which can be stored on the client and which on a server. They can be classified into four rough categories; states that are associated with:</p>

<ol>
<li>having particular <strong>content</strong> in the page, such as having a particular thread open in a webmail application</li>
<li>viewing a particular <strong>part</strong> of the content, such as a particular message within a thread that is being shown in the page</li>
<li>having a particular <strong>view</strong> of the content, such as which folders in a navigational folder list are collapsed or expanded</li>
<li>a <strong>user-interface feature</strong>, such as whether a drop-down menu is open or closed</li>
</ol>

<p>States that have different content almost certainly need to have different URIs so that it&#8217;s possible to link to that content (the web being nothing without links). At the other extreme, it&#8217;s very unlikely that the state of a drop-down menu would need to be captured at all. In between is a large grey area, where a web developer might decide not to capture state at all, to capture it in the client, in the server, or to make it addressable by giving it a URI.</p>

<p>If a web developer chooses to make a state addressable through a URI, they again have choices to make about which part of the URI to use: should different states have different domains? different paths? different query parameters? different fragment identifiers? Hash URIs make states addressable that developers might otherwise leave unaddressable.</p>

<p>To give some examples, on <a href="http://www.legislation.gov.uk/">legislation.gov.uk</a> we have decided to:</p>

<ul>
<li>use the path to indicate a particular piece of content (eg which section of an item of legislation you want to look at), for example <a href="http://www.legislation.gov.uk/ukpga/1985/67/section/6"><code>/ukpga/1985/67/section/6</code></a></li>
<li>use query parameters for particular views on that content (eg whether you want to see the timeline associated with the section or not), for example <a href="http://www.legislation.gov.uk/ukpga/1985/67/section/6?view=timeline&amp;timeline=true"><code>/ukpga/1985/67/section/6?view=timeline&amp;timeline=true</code></a></li>
<li>use fragment identifiers to jump to subsections, for example <a href="http://www.legislation.gov.uk/ukpga/1985/67/section/6#section-6-2"><code>/ukpga/1985/67/section/6#section-6-2</code></a></li>
<li>also use fragment identifiers for enhanced views (eg when viewing a section after a text search) <a href="http://www.legislation.gov.uk/ukpga/1985/67/section/6#text%3Dschool%20bus"><code>/ukpga/1985/67/section/6#text%3Dschool%20bus</code></a></li>
</ul>

<p>The last of these states would probably have gone un-addressed if we couldn&#8217;t use a hash URI for it. The only changes that it makes to the normal page are currently to the links to other legislation content, so that you can go (back) to a highlighted table of contents (though we hope to expand it to provide in-section highlighting). Given that we rely heavily on caching to provide the performance that we want and that there&#8217;s an infinite variety of free-text search terms, it&#8217;s simply not worth the performance cost of having a separate base URI for those views.</p>

<h3>Caching and Parallelisation</h3>

<p>Fragment identifiers are currently the only part of a URI that can be changed without causing a browser to refresh the page (though see the note below). Moving to a different base URI &#8212; changing its domain, path or query &#8212; means making a new request on the server. Having a new request for a small change in state makes for greater load on the server and a worse user experience due both to the latency inherent in making new requests and the large amount of repeated material that has to be sent across the wire.</p>

<blockquote>
  <p><em>Note: HTML5 introduces <a href="http://www.w3.org/TR/html5/history.html#the-history-interface"><code>pushState()</code> and <code>replaceState()</code> methods</a> in its history API that enable a script to add new URIs to the browser&#8217;s history without the browser actually navigating to that page. This is new functionality, at time of writing only supported in Chrome, Safari and Firefox (and not completely in any of them) and unlikely to be included in IE9. When this functionality is more widely adopted, it will be possible to change state to a new base URI without causing a page load.</em></p>
</blockquote>

<p>When a change of state involves simply viewing a different part of existing content, or viewing it in a different way, a hash URI is often a reasonable solution. It supports addressability without requiring an extra request.</p>

<p>Things become fuzzier when the same base URI is used to support different content, where transclusion is used. In these cases, the page that you get when you request the base URI itself gets content from the server as one or more separate AJAX requests based on the fragment identifier. Whether this ends up giving better performance depends on a variety of factors, such as:</p>

<ul>
<li><strong>How large are the static portions of the page (served directly) compared to the dynamic parts (served using AJAX)?</strong> If the majority of the content is static as a user moves through the site, you&#8217;re going to benefit from only loading the dynamic parts as state changes.</li>
<li><strong>Can different portions of the page be requested in parallel?</strong> These days, <a href="http://calendar.perfplanet.com/2010/thoughts-on-performance/">making many small requests may lead to better performance than one large one</a>.</li>
<li><strong>Can the different portions of the page be cached locally or in a <acronym title="content-delivery network">CDN</acronym>?</strong> You can make best use of caches if the rapidly changing parts of a page are requested separately from the slowly changing parts.</li>
</ul>

<h3>Distributed Applications</h3>

<p>Hash URIs can also be very useful in distributed web applications, where the code that is used to provide an interface pulls in data from a separate, unconnected source. Simple examples are mashups that use data provided by different sources, requested using AJAX, and combine that data to create a new visualisation.</p>

<p>But more advanced applications are beginning to emerge, particularly as a reaction to silo sites such as Google and Facebook, which lock us in to their applications by controlling our data. From the <a href="http://www.unhosted.org/manifesto.html">unhosted manifesto</a>:</p>

<blockquote>
  <p>To be unhosted, a website&#8217;s code will need to be very ajaxy first, so that all the servers do is store and serve json data. No server-side processing. This is because we need to switch from transport-layer encryption to client-side payload encryption (we no longer necessarily trust the server we&#8217;re talking to). From within the app&#8217;s source code, that should run entirely in JavaScript and HTML5, json-objects can be stored, retrieved, sent, and received. The user will have the same experience (we even managed to avoid needing a plugin), but the website is unhosted in the sense that the servers you talk to only see encrypted data and don&#8217;t even know which application you are running.</p>
</blockquote>

<p>The aim of unhosted is to separate application code from user data. This divides servers (at least functionally) into those that store and make available user data, and those that host applications and any supporting code, images and so on. The important feature of these sites is that user data never passes through the web application&#8217;s server. This frees users to move to different applications without losing their data.</p>

<p>This doesn&#8217;t necessarily stop the application server from doing <em>any</em> processing, including URI-based processing; it is only that the processing cannot be based on user data &#8212; the content of the site. Since this content is going to be accessed through AJAX anyway, there&#8217;s little motivation for unhosted applications to use anything other than local storage and hash URIs to encode state.</p>

<h3>SEO</h3>

<p>A final reason for using hash URIs that I&#8217;ve seen cited is that it increases the page rank for the base URI, because as far as a search engine is concerned, more links will point to the same base URI (even if in fact they are pointing to a different hash URI). Of course this doesn&#8217;t apply to hash-bang URIs, since the point of them is precisely to enable search engines to distinguish between (and access content from) URIs whose base URI is the same.</p>

<h2>Disadvantages of Hash URIs</h2>

<p>So hash-bangs can give a performance improvement (and hence a usability improvement), and enable us to build new kinds of web applications. So what are the arguments against using them?</p>

<h3>Restricted Access</h3>

<p>The main disadvantages of using hash URIs generally to support AJAX state arise due to them having to be interpreted by Javascript. This immediately causes problems for:</p>

<ul>
<li>users who have chosen to turn off Javascript because:
<ul>
<li>they have bandwidth limitations</li>
<li>they have security concerns</li>
<li>they want a calmer browser experience</li>
</ul></li>
<li>clients that don&#8217;t support Javascript at all such as:
<ul>
<li>search engines</li>
<li>screen scrapers</li>
</ul></li>
<li>clients that have buggy Javascript implementations that you might not have accounted for such as:
<ul>
<li>older browsers</li>
<li>some mobile clients</li>
</ul></li>
</ul>

<p>The most recent statistic I could find, about access to the <a href="http://developer.yahoo.com/blogs/ydn/posts/2010/10/how-many-users-have-javascript-disabled/">Yahoo home page</a>, indicates that up to 2% of access is from users without Javascript (they excluded search engines). According to a <a href="http://webaim.org/projects/screenreadersurvey3/#javascript">recent survey</a>, about the same percentage of screen reader users have Javascript turned off.</p>

<p>This is a low percentage, but if you have large numbers of visitors it adds up. The site that I care most about, <a href="http://legislation.gov.uk">legislation.gov.uk</a>, has over 60,000 human visitors a day, which means that about 1,200 of them will be visiting without Javascript. If our content were completely inaccessible to them we&#8217;d be inconveniencing a large number of users.</p>

<h3>Brittleness</h3>

<p>Depending on hash-bang URIs to serve content is also brittle, as Gawker found. If the Javascript that interprets the fragment identifier is temporarily inaccessible or unable to run in a particular browser, any portions of a page that rely on Javascript also become inaccessible.</p>

<h3>Replacing HTTP</h3>

<p>There are other, less obvious, impacts which occur when you use a hash-bang URI.</p>

<p>The URI held in the <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">HTTP Referer header</a> &#8220;MUST NOT include a fragment&#8221;. As <a href="http://isolani.co.uk/blog/javascript/BreakingTheWebWithHashBangs">Mike Davies noted</a>, this prevents such URIs from showing up in server logs, and stops people from working out which of your pages are linking to theirs. (Of course, this might be a good thing in some circumstances; there might be aspects of the state of a page that you&#8217;d rather a referenced server not know about.)</p>

<p>You should also consider the impact on the future proofing of your site. When a server knows the entirety of a URI, it can use HTTP mechanisms to indicate when pages have moved, gone, or never existed. With hash URIs, if you change the URIs you use on your site, the Javascript that interprets the fragment identifier needs to be able to recognise and support any redirections, missing, or never existing pages. The HTTP status code for the wrapper page will always be <code>200 OK</code>, but it will be meaningless.</p>

<p>Even if your site structure doesn&#8217;t change, if you use hash-bang URIs as your primary set of URIs, you&#8217;re likely to find it harder to make a change back to using hashless URIs in the future. Again, you will be reliant in perpetuity on Javascript routing to decipher the hash-bang URI and redirect it to a hashless URI.</p>

<h3>Lack of Differentiation</h3>

<p>A final factor is that fragment identifiers can become overcrowded with state information. In a purely hash-URI-based site, what if you wanted to jump to a particular place within particular content, shown with a particular view? The hash URI has to encode all three of these pieces of information. Once you start using hash-bang URIs, there is no way to indicate within the URI (for search engines, for example) that a particular piece of the URI can be ignored when checking for equivalence. With normal hash URIs, there is an assumption that the fragment identifier can basically be ignored; with hash-bang URIs that is no longer true.</p>

<h2>Good Practice</h2>

<p>Having looked at the advantages and disadvantages, I would echo what seems to be the general sentiment around traditional server-based websites that use hash-bang URIs: <strong>pages that give different content should have different base URIs, not just different fragment identifiers</strong>. In particular, if you&#8217;re serving large amounts of document-oriented content through hash-bang URIs, consider swapping things around and having hashless URIs for the content that then transclude in the large headers, footers and side bars that form the static part of your site.</p>

<p>However, if you are running a server-based, data-driven web application and your primary goal is a smooth user experience, it&#8217;s understandable why you might want to offer hash URIs for your pages to the 98% of people who can benefit from it, even for transcluded content. In these cases I&#8217;d argue that you should practice progressive enhancement:</p>

<ol>
<li>support hashless URIs which <em>do not</em> simply redirect to a hash URI, and design your site around those</li>
<li>use hash-bang URIs as suggested by Google rather than simple hash URIs</li>
<li>provide an easy way to get the sharable, hashless URI for a particular page when it is accessed with a hash-bang URI</li>
<li>use hashless URIs within links; these can be overridden with onclick listeners for those people with Javascript; using the hashless URI ensures that &#8216;Copy Link Location&#8217; will give a sharable URI</li>
<li>use the HTML5 history API where you can to add or replace the relevant hashless URI in the browser history as state changes</li>
<li>ensure that only those visitors that both have Javascript enabled and do not have support for HTML5&#8217;s history API have access to the hash-bang URIs by using Javascript to, for example:
<ul>
<li>redirect to a hash-bang URI</li>
<li>rewrite URIs within pages to hash-bang URIs</li>
<li>attach onclick listeners to links</li>
</ul></li>
<li>support the <code>_escaped_fragment_</code> query parameter, the result of which should be a redirection to the appropriate hashless URI</li>
</ol>

<p>This is roughly what Twitter has done, except that it doesn&#8217;t make it easy to get the hashless URI from a page or from links within the page. Of course the mapping in Twitter&#8217;s case is the straight-forward removal of the <code>#!</code> from the URI, but as a human it&#8217;s frustrating to have to do this by hand.</p>

<p>The above measures ensure that your site will remain as accessible as possible to all users and provide a clear migration path as the HTML5 history API gains acceptance. The slight disadvantage is that encouraging people to use hashless URIs for links means that you can no longer depend quite so much on caching as the first page that people access in a session might be any page (whereas with a pure hash-bang scheme everyone goes to the same initial page).</p>

<p>Distributed, client-based websites can take the same measures &#8212; the application&#8217;s server can send back the same HTML page regardless of the URI used to access it; Javascript can pull information from a URI&#8217;s path as easily as it can from a fragment identifier. The biggest difficulty is supporting the static page through the <code>_escaped_fragment_</code> convention without passing user data through the application server. I suspect we might find a third class of service arise: trusted third-party proxies using headless browsers to construct static versions of pages without storing either data or application logic. Time will tell.</p>

<h2>The Deeper Questions</h2>

<p>There are some deeper issues here regarding web architecture. In the traditional web, there is a one-to-one correspondence between the representation of a resource that you get in response to a request from a server, and the content that you see on the page (or a search engine retrieves). With a traditional hash URI for a fragment, the HTTP headers you retrieve for the page are applicable to the hash URI as well. In a web application that uses transclusion, this is not the case.</p>

<blockquote>
  <p><em>Note: It&#8217;s also impossible to get metadata about hash URIs used for real-world or abstract things using HTTP; in these cases, the metadata about the thing can only be retrieved through interpreting the data within the page (eg an RDF document). Whereas with the <code>303 See Other</code> pattern for publishing linked data, it&#8217;s possible to use a <code>404 Not Found</code> response to indicate a thing that does not exist, there is no equivalent with hash URIs. Perhaps this is what lies at the root of my feeling of unease about them.</em></p>
</blockquote>

<p>With hash-bang URIs, there are in fact three (or more) URIs in play: the hash-bang URI (which identifies a wrapper page with particular content transcluded within it), a base URI (which identifies the wrapper HTML page) and one or more content URIs (against which AJAX requests are made to retrieve the relevant content). Requests to the base URI and the content URIs provide us with HTTP status codes and headers that describe those particular representations. The only way of discovering similar metadata about the hash-bang URI itself is through the <code>_escaped_fragment_</code> query parameter convention which maps the hash-bang URI into a hashless URI that can be requested.</p>

<p>Does this matter? Do hash-bang URIs &#8220;break the web&#8221;? Well, to me, &#8220;breaking the web&#8221; is about breaking the implicit socio-technical contract that we enter into when we publish websites. At the social level, sites break the web when they <a href="http://blog.tommorris.org/post/3512773108/channel-4-showing-the-fruits-of-content-lifecycle">withdraw support for URIs that are widely referenced elsewhere</a>, hide content behind register- or pay-walls, or discriminate against those who suffer from disabilities or low bandwidth. At the technical level, it&#8217;s when sites lie in HTTP. It&#8217;s when they serve up pages with the title &#8220;Not Found&#8221; with the HTTP status code <code>200 OK</code>. It&#8217;s when they serve non-well-formed HTML as <code>application/xhtml+xml</code>.</p>

<p>These things matter because we base our own behaviour on the contract being kept. If we cannot trust major websites to continue to support the URIs that they have coined, how can we link to them? If we cannot trust websites to provide accurate metadata about the content that they serve, how can we write applications that cache or display or otherwise use that information? On their own, pages that use Javascript-based transclusion break both the social side (in that they limit access to those with Javascript) and the technical side (in that they cannot properly use HTTP) of the contract.</p>

<p>But contracts do get rewritten over time. The web is constantly evolving and we have to revise the contract as new behaviours and new technologies gain adoption. The <code>_escaped_fragment_</code> convention gives a life line: a method of programmatically discovering how to access the version of a page without Javascript, and to discover metadata about it through HTTP. It is not a pretty pattern (I would much prefer that the server returned a header containing a <a href="http://tools.ietf.org/html/draft-gregorio-uritemplate-04">URI template</a> that described how to create the hashless equivalent of a hash-bang URI, and to have some rules about the parsing of a hash-bang fragment identifier so that it could include other fragment identifiers) but it has the benefit of adoption.</p>

<p>In short, hash-bang URIs are an important pattern that will be around for several years because they offer many benefits compared to their alternatives, and because HTML5&#8217;s history API is still a little way off general support. Rather than banging the drum against hash-bang URIs, we need to try to make them work as well as they can by:</p>

<ul>
<li>berating sites that use plain hash URIs for transcluded content</li>
<li>encouraging sites that use hash-bang URIs to follow some good practices such as those I outlined above</li>
<li>encouraging applications, such as browsers and search engines, to automatically map hash-bang URIs into the <code>_escaped_fragment_</code> pattern when they do not have Javascript available</li>
</ul>

<p>We also need to keep a close eye on emerging patterns in distributed web applications to ensure that these efforts are supported in the standards on which the web is built.</p>

                           </div>
                           <div id="more" class="entry-more">
                              

                           </div>
                       <p class="postinfo">Filed by <a href="">Jeni Tennison</a> on May 12, 2011  6:17 PM in <a href="http://www.w3.org/QA/archive/web_architecture/">Web Architecture</a><br />
<span class="separator">|</span> <a class="permalink" href="http://www.w3.org/QA/2011/05/hash_uris.html">Permalink</a>
                                 | <a href="http://www.w3.org/QA/2011/05/hash_uris.html#comments">Comments (4)</a>
                                 | <a href="http://www.w3.org/QA/2011/05/hash_uris.html#trackback">TrackBacks (0)</a>
</p>



<h3 class="comments-header" id="comments">Comments</h3>
<div class="comment" id="comment-228039">
<p class="comment-meta" id="c228039">
<span class="comment-meta-author"><strong>Josh T. </strong></span>
<span class="comment-meta-date"><a href="#c228039">#</a> 2011-05-13</span>
</p>
<div class="comment-bulk">
<p>I think it would be better if this was done in the HTML rather than scripts. For example...</p>

<p></p>

<p>
<a href="#!foo" rel="nofollow">foo</a>
<a href="#!bar" rel="nofollow">bar</a>
</p>

<p>

</p>

<p>Your browser does not support hashbangs.
</p>

<p>Of course, this wouldn't work for more complicated things. There's probably a better way of doing it.</p>

</div>
</div>


<div class="comment" id="comment-228046">
<p class="comment-meta" id="c228046">
<span class="comment-meta-author"><strong>John Thomas </strong></span>
<span class="comment-meta-date"><a href="#c228046">#</a> 2011-05-14</span>
</p>
<div class="comment-bulk">
<p>Another issue with hash uri's is it breaks the refresh button. It's frustrating to wonder why this page isn't updating until you realize there's a hash at the end of the uri</p>

</div>
</div>


<div class="comment" id="comment-228185">
<p class="comment-meta" id="c228185">
<span class="comment-meta-author"><strong>Steve S </strong></span>
<span class="comment-meta-date"><a href="#c228185">#</a> 2011-05-20</span>
</p>
<div class="comment-bulk">
<p>I'm a website hosting company [redacted] and I see people buying domain names with hash's in the URL's for just one purpose.  To get good ranking and spam customers with there google ads or affiliate program.  They are just filling google with junk websites.  I would love to see them go away.</p>

</div>
</div>


<div class="comment" id="comment-547675">
<p class="comment-meta" id="c547675">
<span class="comment-meta-author"><strong>jmarranz </strong></span>
<span class="comment-meta-date"><a href="#c547675">#</a> 2011-10-09</span>
</p>
<div class="comment-bulk">
<p>First of all, thanks for linking the Single Page Interface Manifesto</p>

<p>I like most of your article, it is one of the best articles in this subject, said this, I don't agree with some of your "Disadvantages" of hash based URIs:</p>

<ul>
<li><p>Restricted Access aka "don't work when JavaScript is disabled".</p>

<p>Take a look to this example, is a complex and real world web site, it is SPI and also works with JS disabled:</p>

<p><a href="http://www.innowhere.com:8080/insites/" rel="nofollow">http://www.innowhere.com:8080/insites/</a></p>

<p>It uses a Java web framework which makes very easy "dual web sites" (SPI and page based in the same time), anyway some ideas can be ported to other frameworks. </p></li>
<li><p>Brittleness</p>

<p>It was a Gawker JavaScript bug, we're accustomed to see half-working web sites with tons of JS bugs, yes in a SPI centric world we must be more careful because it can fully break your site, it was just a bug, when the bug is fixed end of the problem.</p></li>
<li><p>Replacing HTTP</p>

<p>No, you're not replacing HTTP, you're just leveraging the web to something beyond a "linked collection of scientific documents", the web is being used these days for things very different to the original purpose of web as it was born in CERN.</p>

<p>Currently, for me, <a href="http://twitter.com/#!jmarranz" rel="nofollow">http://twitter.com/#!jmarranz</a> is so familiar as <a href="http://twitter.com/jmarranz" rel="nofollow">http://twitter.com/jmarranz</a>  </p>

<p>I just need to fix bookmarking when JS is disabled, to fix this problem I need the hash  sent to the server to return the right initial content, when I get this "minor" problem fixed we can achieve the SPI utopia.</p></li>
</ul>

</div>
</div>



  <div class="comments-open" id="comments-open">
<h3 class="comments-open-header">Leave a comment</h3>

<div class="comments-open-moderated">
   <p>
   Note: this blog is intended to foster <strong>polite
   on-topic discussions</strong>. Comments failing these
   requirements and spam will not get published. Please,
   enter your real name and email address. Every
   individual comment is reviewed by the W3C staff.
   This may take some time, thank you for your patience.
   </p>
   <p>
   You can use the following HTML markup (a href, b, i, 
   br/, p, strong, em, ul, ol, li, blockquote, pre) 
   and/or <a href="http://daringfireball.net/projects/markdown/syntax">Markdown syntax</a>.</p>
</div>

<div id="comments-open-data">
<form method="post" action="http://www.w3.org/QA/sununga/beach.pl" id="comments-form">
<h4>Your comment</h4>
<div id="comments-open-text">
  <textarea id="comment-text" name="text" rows="20" cols="100"></textarea><br />
<label for="comment-text">Write your comment text here. Remember, keep the discussion on topic and courteous.</label>
</div>

<h4>About you</h4>
<div id="comment-form-name">
  <input type="hidden" name="static" value="1" />
<input type="hidden" name="entry_id" value="9092" />
<input type="hidden" name="__lang" value="en" /> 
<label for="comment-author">Your Name</label>
<input id="comment-author" name="author" size="30" value="" />
</div>
<div id="comment-form-email">
<label for="comment-email">Your Email Address</label>
<input id="comment-email" name="email" size="30" value="" />
</div>

<div id="comments-open-footer">
<input type="submit" accesskey="s" name="post" id="comment-submit" value="Submit" />

</div>
</form>
</div>
</div>



<p id="gentime">This page was last generated on $Date: 2011/12/16 03:00:16 $</p> 

      </div><!-- End of "main" DIV. -->

<address>

This blog is written by W3C staff and working group participants,<br />
&nbsp;and maintained by <a href="/People/CMercier/">Coralie Mercier</a>.<br />
Authorized parties may <a href="/QA/new">log in</a> to create a new entry.<br/>
<span id="poweredby">Powered by Movable Type, magpierss and a lot of Web Technology</span>
    </address>


    
    <p class="copyright">
      <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 1994-2011
      <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a>&reg;
      (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>,
      <a href="http://www.ercim.eu/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>,
      <a href="http://www.keio.ac.jp/">Keio</a>),
      All Rights Reserved.
      W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>,
      <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>,
      <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a>
      and <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/copyright-software">software licensing</a>
      rules apply. Your interactions with this site are in accordance
      with our <a href="http://www.w3.org/Consortium/Legal/privacy-statement#Public">public</a> and
      <a href="http://www.w3.org/Consortium/Legal/privacy-statement#Members">Member</a> privacy
      statements.
    </p>

  </body>
</html>