Choosing_an_HTML_Data_Format 41.9 KB

Raw Blame History Permalink

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" dir="ltr">
	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
		<meta http-equiv="Content-Style-Type" content="text/css" />
		<meta name="generator" content="MediaWiki 1.15.5" />
		<meta name="keywords" content="Choosing an HTML Data Format,HTML Data Vocabularies,Html-data-tf,Mapping Microdata to RDF,Mixing HTML Data Formats" />
		<link rel="shortcut icon" href="/favicon.ico" />
		<link rel="search" type="application/opensearchdescription+xml" href="/wiki/opensearch_desc.php" title="W3C Wiki (en)" />
		<link rel="alternate" type="application/rss+xml" title="W3C Wiki RSS feed" href="/wiki/index.php?title=Special:RecentChanges&amp;feed=rss" />
		<link rel="alternate" type="application/atom+xml" title="W3C Wiki Atom feed" href="/wiki/index.php?title=Special:RecentChanges&amp;feed=atom" />
		<title>Choosing an HTML Data Format - W3C Wiki</title>
		<link rel="stylesheet" href="/wiki/skins/common/shared.css?207" type="text/css" media="screen" />
		<link rel="stylesheet" href="/wiki/skins/common/commonPrint.css?207" type="text/css" media="print" />
		<link rel="stylesheet" href="/wiki/skins/w3cmonobook/main.css?207" type="text/css" media="screen" />
		<!--[if lt IE 5.5000]><link rel="stylesheet" href="/wiki/skins/w3cmonobook/IE50Fixes.css?207" type="text/css" media="screen" /><![endif]-->
		<!--[if IE 5.5000]><link rel="stylesheet" href="/wiki/skins/w3cmonobook/IE55Fixes.css?207" type="text/css" media="screen" /><![endif]-->
		<!--[if IE 6]><link rel="stylesheet" href="/wiki/skins/w3cmonobook/IE60Fixes.css?207" type="text/css" media="screen" /><![endif]-->
		<!--[if IE 7]><link rel="stylesheet" href="/wiki/skins/w3cmonobook/IE70Fixes.css?207" type="text/css" media="screen" /><![endif]-->
		<link rel="stylesheet" href="/wiki/index.php?title=MediaWiki:Common.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=18000&amp;action=raw&amp;maxage=18000" type="text/css" />
		<link rel="stylesheet" href="/wiki/index.php?title=MediaWiki:Print.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=18000&amp;action=raw&amp;maxage=18000" type="text/css" media="print" />
		<link rel="stylesheet" href="/wiki/index.php?title=MediaWiki:W3cmonobook.css&amp;usemsgcache=yes&amp;ctype=text%2Fcss&amp;smaxage=18000&amp;action=raw&amp;maxage=18000" type="text/css" />
		<link rel="stylesheet" href="/wiki/index.php?title=-&amp;action=raw&amp;maxage=18000&amp;gen=css" type="text/css" />
		<!--[if lt IE 7]><script type="text/javascript" src="/wiki/skins/common/IEFixes.js?207"></script>
		<meta http-equiv="imagetoolbar" content="no" /><![endif]-->

		<script type= "text/javascript">/*<![CDATA[*/
		var skin = "w3cmonobook";
		var stylepath = "/wiki/skins";
		var wgArticlePath = "/wiki/$1";
		var wgScriptPath = "/wiki";
		var wgScript = "/wiki/index.php";
		var wgVariantArticlePath = false;
		var wgActionPaths = {};
		var wgServer = "http://www.w3.org";
		var wgCanonicalNamespace = "";
		var wgCanonicalSpecialPageName = false;
		var wgNamespaceNumber = 0;
		var wgPageName = "Choosing_an_HTML_Data_Format";
		var wgTitle = "Choosing an HTML Data Format";
		var wgAction = "view";
		var wgArticleId = "6021";
		var wgIsArticle = true;
		var wgUserName = null;
		var wgUserGroups = null;
		var wgUserLanguage = "en";
		var wgContentLanguage = "en";
		var wgBreakFrames = false;
		var wgCurRevisionId = 55735;
		var wgVersion = "1.15.5";
		var wgEnableAPI = true;
		var wgEnableWriteAPI = true;
		var wgSeparatorTransformTable = ["", ""];
		var wgDigitTransformTable = ["", ""];
		var wgRestrictionEdit = [];
		var wgRestrictionMove = [];
		/*]]>*/</script>

		<script type="text/javascript" src="/wiki/skins/common/wikibits.js?207"><!-- wikibits js --></script>
		<!-- Head Scripts -->
		<script type="text/javascript" src="/wiki/skins/common/ajax.js?207"></script>
		<link rel="alternate" type="application/rdf+xml" title="Choosing an HTML Data Format" href="/wiki/index.php?title=Special:ExportRDF/Choosing_an_HTML_Data_Format&amp;xmlmime=rdf" />
		<script type="text/javascript" src="/wiki/index.php?title=-&amp;action=raw&amp;gen=js&amp;useskin=w3cmonobook"><!-- site js --></script>
	</head>
<body class="mediawiki ltr ns-0 ns-subject page-Choosing_an_HTML_Data_Format skin-w3cmonobook">
	<div id="globalWrapper">
		<div id="column-content">
	<div id="content">
		<a id="top"></a>
				<h1 id="firstHeading" class="firstHeading">Choosing an HTML Data Format</h1>
		<div id="bodyContent">
			<h3 id="siteSub">From W3C Wiki</h3>
			<div id="contentSub"></div>
									<div id="jump-to-nav">Jump to: <a href="#column-one">navigation</a>, <a href="#searchInput">search</a></div>			<!-- start content -->
			<p>This page describes the recommendations of the <a href="/wiki/Html-data-tf" title="Html-data-tf">HTML Data TF</a> regarding how you choose which data format to use to embed data within your HTML pages, if you are a data publisher, or target for consumption, if you are a consumer.
</p>
<table id="toc" class="toc" summary="Contents"><tr><td><div id="toctitle"><h2>Contents</h2></div>
<ul>
<li class="toclevel-1"><a href="#Publishers"><span class="tocnumber">1</span> <span class="toctext">Publishers</span></a>
<ul>
<li class="toclevel-2"><a href="#Choosing_a_Publishing_Format"><span class="tocnumber">1.1</span> <span class="toctext">Choosing a Publishing Format</span></a>
<ul>
<li class="toclevel-3"><a href="#Syntax_Considerations"><span class="tocnumber">1.1.1</span> <span class="toctext">Syntax Considerations</span></a></li>
<li class="toclevel-3"><a href="#Vocabulary_Considerations"><span class="tocnumber">1.1.2</span> <span class="toctext">Vocabulary Considerations</span></a></li>
<li class="toclevel-3"><a href="#Usability_Considerations"><span class="tocnumber">1.1.3</span> <span class="toctext">Usability Considerations</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="#Publishing_in_Multiple_Formats"><span class="tocnumber">1.2</span> <span class="toctext">Publishing in Multiple Formats</span></a></li>
<li class="toclevel-2"><a href="#Good_Publishing_Practice"><span class="tocnumber">1.3</span> <span class="toctext">Good Publishing Practice</span></a></li>
</ul>
</li>
<li class="toclevel-1"><a href="#Consumers"><span class="tocnumber">2</span> <span class="toctext">Consumers</span></a>
<ul>
<li class="toclevel-2"><a href="#Choosing_a_Syntax_to_Consume"><span class="tocnumber">2.1</span> <span class="toctext">Choosing a Syntax to Consume</span></a>
<ul>
<li class="toclevel-3"><a href="#Tooling_Considerations"><span class="tocnumber">2.1.1</span> <span class="toctext">Tooling Considerations</span></a>
<ul>
<li class="toclevel-4"><a href="#microdata.2Fmicroformats-2_data_model"><span class="tocnumber">2.1.1.1</span> <span class="toctext">microdata/microformats-2 data model</span></a></li>
<li class="toclevel-4"><a href="#RDF_data_model"><span class="tocnumber">2.1.1.2</span> <span class="toctext">RDF data model</span></a></li>
</ul>
</li>
<li class="toclevel-3"><a href="#Data_Model_Considerations"><span class="tocnumber">2.1.2</span> <span class="toctext">Data Model Considerations</span></a></li>
<li class="toclevel-3"><a href="#Usability_Considerations_2"><span class="tocnumber">2.1.3</span> <span class="toctext">Usability Considerations</span></a></li>
</ul>
</li>
<li class="toclevel-2"><a href="#Good_Consumption_Practice"><span class="tocnumber">2.2</span> <span class="toctext">Good Consumption Practice</span></a></li>
</ul>
</li>
</ul>
</td></tr></table><script type="text/javascript"> if (window.showTocToggle) { var tocShowText = "show"; var tocHideText = "hide"; showTocToggle(); } </script>
<a name="Publishers" id="Publishers"></a><h2> <span class="mw-headline"> Publishers </span></h2>
<p>You are likely to find that the markup within your pages is simpler and easier to maintain if you only use one format (syntax and vocabulary) within each page. To decide which to use, your first consideration has to be which consumers will read the data within your web pages, and which formats they support. These may include:
</p>
<ul><li> scripting libraries
</li><li> browsers and browser plug-ins
</li><li> general-purpose search engines
</li><li> vertical or domain-specific search engines
</li><li> data reusers with whom you have agreements
</li></ul>
<p>Your second consideration may be the current state of the tooling to support a particular format. For example:
</p>
<dl><dt>Are you able to publish using HTML5?
</dt><dd>If you are using a content-management system that doesn't support adding new attributes such as <code>@itemprop</code> or <code>@typeof</code> or if your publishing guidelines require validity against an older version of HTML, then you will be constrained to using microformats. If your publishing guidelines require validity against XHTML, then you might be able to use XHTML+RDFa, depending on how precise your publishing guidelines are.
</dd><dt>Are there development tools available?
</dt><dd>Because it is not visible within a web page, it can be hard to tell whether HTML data has been written correctly. Consumers should provide validators that enable you to check that your data has been correctly detected and interpreted, but you may also want to consider tool support for generating the HTML data.
</dd></dl>
<p>Once you have considered both your target consumers and the tooling support that is available, you will be in one of four situations:
</p>
<ol><li> <b>with a single choice of format</b> in which case you are good to go
</li><li> <b>unable to publish HTML data that your target consumers understand</b> in which case you either have to lobby those consumers to add support for the format(s) you can publish in, or consider changing your toolset so that you can publish in something they understand
</li><li> <b>still with a choice between a number of formats</b> in which case you will want to pick one (see below)
</li><li> <b>having to use multiple formats at the same time to provide data to all your target consumers</b> in which case you will need to mix formats within your pages (see below)
</li></ol>
<a name="Choosing_a_Publishing_Format" id="Choosing_a_Publishing_Format"></a><h3> <span class="mw-headline"> Choosing a Publishing Format </span></h3>
<p>This section addresses a situation where all your target consumers recognise a set of formats (each with a particular syntax and vocabulary), your toolset supports publishing in all of them, and you need to make a choice about which of these formats to use. It's assumed that you will want to choose a single format rather than <a href="/wiki/Mixing_HTML_Data_Formats" title="Mixing HTML Data Formats">mixing multiple formats</a>, as this will mean less markup in your page and make your publishing task easier.
</p>
<a name="Syntax_Considerations" id="Syntax_Considerations"></a><h4> <span class="mw-headline"> Syntax Considerations </span></h4>
<p>The different syntaxes -- microformats, microdata and RDFa -- have different capabilities which may inform your choice.
</p>
<dl><dt>Structured HTML values
</dt><dd>Under appropriate conditions, RDFa and microformats will use markup within the content of an element to provide a property value; in microdata values never retain markup. If property values within your page contain markup (for example <code>description</code>s containing emphasised text, multiple paragraphs, tables and so on), you may want to use RDFa or microformats to ensure that structure is available to consumers of your pages. In RDFa, this is done through adding <code>datatype="rdf:XMLLiteral"</code> to the relevant element. In microformats, the handling of the content of an element is determined by the property; in microformats-2, those that retain the HTML structure are named with a <code>e-*</code> prefix, such as <code>e-content</code>.
</dd><dt>Language support
</dt><dd>Microformats and RDFa use the language of the HTML elements in the page (from the <code>lang</code> attribute) to indicate the language of relevant values. In microdata, the vocabulary has to provide a separate mechanism to indicate a language (pending resolution of <a href="http://www.w3.org/Bugs/Public/show_bug.cgi?id=14470" class="external text" title="http://www.w3.org/Bugs/Public/show_bug.cgi?id=14470" rel="nofollow">bug 14470</a>). If you have multi-lingual information in your pages, you may find it easier to use microformats or RDFa than microdata.
</dd><dt>CSS support
</dt><dd>Because microformats generally use classes to mark up data within an HTML page, it is easy to use CSS to style those elements based on their type. For example <code>.hcard .n { font-weight: bold; }</code> will embolden any person's name. This is a little harder with microdata (where the selector might be something like <code>[itemtype~="<a href="http://microformats.org/profile/hcard" class="external free" title="http://microformats.org/profile/hcard" rel="nofollow">http://microformats.org/profile/hcard</a>"] [itemprop~="n"]</code>) or RDFa (where it might be <code>[typeof~="foaf:Person"] [property~="foaf:name"]</code>). If you are planning to style your page based on the data embedded within it, you may find it easier to use microformats than either microdata or RDFa; if you do style RDFa, you should plan for dependencies between your CSS documents and any prefixes used within it.
</dd></dl>
<p>TODO: Other guidelines?
</p>
<a name="Vocabulary_Considerations" id="Vocabulary_Considerations"></a><h4> <span class="mw-headline"> Vocabulary Considerations </span></h4>
<p>Vocabularies and syntaxes are closely tied together, especially in the case of microformats. Aspects of a vocabulary to bear in mind are:
</p>
<ul><li> How closely does it match with the information that you have?
</li><li> How much support does it have? Are there tools for validating and viewing it? Is there good documentation?
</li><li> How stable is it? Who has control to make changes to it? How frequently might those changes be made?
</li><li> Are other consumers likely to adopt it in the future?
</li></ul>
<a name="Usability_Considerations" id="Usability_Considerations"></a><h4> <span class="mw-headline"> Usability Considerations </span></h4>
<p>The usability of a particular format is likely to depend on your existing expertise and the match between the structure and content of your web pages and the required structure and content of the format. The best thing to do is to try using the format to mark up an example page from your site.
</p><p>TODO: Example?
</p>
<a name="Publishing_in_Multiple_Formats" id="Publishing_in_Multiple_Formats"></a><h3> <span class="mw-headline"> Publishing in Multiple Formats </span></h3>
<p>Publishing in multiple formats can be easy. For example, it may be that different consumers expect HTML data to appear in different places within the page, such as Facebook requiring Open Graph Protocol data to appear within the <code>head</code> of an HTML page, while schema.org markup appears in the <code>body</code> of the page. Or it may be that the items that you need to mark up on the page appear in different places -- events listed in a sidebar while company details are provided in a footer, for example.
</p><p>Different formats and vocabularies can be used independently in these circumstances. Consumers of the data within your pages might read additional data if it is in a syntax that they recognise -- for example, a processor that recognises both RDFa and microdata will interpret all such markup in the page -- but it should ignore information that is in a vocabulary that it doesn't understand rather than giving an error.
</p><p>Publishing can be harder when there are multiple consumers of information that require different formats. If your target consumers will all accept the same syntax, it is usually easiest to use that single syntax in your pages. However, microdata does not support multiple types for a single entity, so if your target  consumers expect different vocabularies to be used for the same entities you may find it easier to mix syntaxes or to use RDFa or microformats, which do support multiple vocabularies.
</p><p>Further techniques for mixing different syntaxes and vocabularies within a page are <a href="http://www.w3.org/wiki/Mixing_HTML_Data_Formats" class="external text" title="http://www.w3.org/wiki/Mixing_HTML_Data_Formats" rel="nofollow">provided on a separate page</a>.
</p>
<a name="Good_Publishing_Practice" id="Good_Publishing_Practice"></a><h3> <span class="mw-headline"> Good Publishing Practice </span></h3>
<p>Valid HTML is particularly important in pages that contain embedded markup. All methods of embedding data within HTML use the structure of the HTML to determine the meaning of the additional markup. For example, the item to which an element with an <code>@itemprop</code> attribute assigns a property is usually the closest ancestor element with an <code>@itemscope</code> attribute.
</p><p>In some cases, elements can be moved when HTML is parsed into a DOM. This can lead to properties unexpectedly referring to the wrong entity, and, if you are serving your documents as XHTML (with an <code>application/xhtml+xml</code> MIME type), it can cause discrepancies between the data gleaned by XML-based consumers and HTML-aware consumers. There are two causes for this:
</p>
<ul><li> Error correction in HTML parsing can restructure invalid HTML to make it valid; for example, non-table markup within a table is moved to before the table. This includes <code>link</code> and <code>meta</code> elements that are directly within the <code>table</code> element. You can avoid this restructuring by making sure that your HTML is valid so that it is not needed.
</li><li> Some older browsers may move <code>meta</code> and/or <code>link</code> elements in the <code>body</code> of an HTML document to within the <code>head</code> element, because they could not validly appear within the body in older versions of HTML. If you are targeting consumers which run within older browsers, such as scripts or plug-ins, you can avoid this restructuring by using empty <code>span</code> or other elements instead of <code>link</code> or <code>meta</code>; other consumers should be using an up-to-date HTML5 parser which will not do this.
</li></ul>
<p>It is good practice to test the data that you expose within your page against a parser that will show you the data your page contains. Existing online parsers include:
</p>
<dl><dt>microdata
</dt><dd><a href="http://foolip.org/microdatajs/live/" class="external text" title="http://foolip.org/microdatajs/live/" rel="nofollow">Live Microdata</a> maps to <a href="http://www.w3.org/TR/microdata/#application-microdata-json" class="external text" title="http://www.w3.org/TR/microdata/#application-microdata-json" rel="nofollow">JSON</a>, vCard and iCal
</dd><dd><a href="http://rdf.greggkellogg.net/distiller" class="external text" title="http://rdf.greggkellogg.net/distiller" rel="nofollow">RDF Distiller</a> maps to various RDF-based formats
</dd><dd><a href="http://any23.org" class="external text" title="http://any23.org" rel="nofollow">any23.org</a>
</dd><dt>RDFa
</dt><dd><a href="http://rdf.greggkellogg.net/distiller" class="external text" title="http://rdf.greggkellogg.net/distiller" rel="nofollow">RDF Distiller</a>
</dd><dd><a href="http://check.rdfa.info" class="external text" title="http://check.rdfa.info" rel="nofollow">check.rdfa.info</a>
</dd><dd><a href="http://www.w3.org/2007/08/pyRdfa/" class="external text" title="http://www.w3.org/2007/08/pyRdfa/" rel="nofollow">Python RDFa 1.0 Distiller</a>, or its <a href="http://www.w3.org/2007/08/pyRdfa/Shadow.html" class="external text" title="http://www.w3.org/2007/08/pyRdfa/Shadow.html" rel="nofollow">Experimental RDFa 1.1</a> version
</dd><dd><a href="http://any23.org" class="external text" title="http://any23.org" rel="nofollow">any23.org</a>
</dd><dt>microformats
</dt><dd>see below for parsers for specific microformats
</dd></dl>
<p>TODO: add more
</p><p>It is good practice to test the data that you expose using a tool that understands the vocabulary you are using. Consumers may provide testing tools and validators for this purpose, or you may need to check the way that vocabulary-specific tools behave with your data. Example vocabulary-aware testing tools and validators include:
</p>
<dl><dt><a href="http://microformats.org/wiki/hcalendar" class="external text" title="http://microformats.org/wiki/hcalendar" rel="nofollow">hCalendar</a>
</dt><dd><a href="http://www.google.com/webmasters/tools/richsnippets" class="external text" title="http://www.google.com/webmasters/tools/richsnippets" rel="nofollow">Google's Rich Snippets Testing Tool</a>
</dd><dd>see also <a href="http://microformats.org/wiki/hcalendar-implementations" class="external text" title="http://microformats.org/wiki/hcalendar-implementations" rel="nofollow">hCalendar implementations</a>
</dd><dt><a href="http://microformats.org/wiki/hcard" class="external text" title="http://microformats.org/wiki/hcard" rel="nofollow">hCard</a>
</dt><dd><a href="http://hcard.geekhood.net/" class="external text" title="http://hcard.geekhood.net/" rel="nofollow">hCard microformat Validator</a>
</dd><dd><a href="http://www.google.com/webmasters/tools/richsnippets" class="external text" title="http://www.google.com/webmasters/tools/richsnippets" rel="nofollow">Google's Rich Snippets Testing Tool</a>
</dd><dd>see also <a href="http://microformats.org/wiki/hcard-implementations" class="external text" title="http://microformats.org/wiki/hcard-implementations" rel="nofollow">hCard implementations</a>
</dd><dt><a href="http://microformats.org/wiki/hreview" class="external text" title="http://microformats.org/wiki/hreview" rel="nofollow">hReview</a>
</dt><dd><a href="http://www.google.com/webmasters/tools/richsnippets" class="external text" title="http://www.google.com/webmasters/tools/richsnippets" rel="nofollow">Google's Rich Snippets Testing Tool</a>
</dd><dd>see also <a href="http://microformats.org/wiki/hreview-implementations" class="external text" title="http://microformats.org/wiki/hreview-implementations" rel="nofollow">hReview implementations</a>
</dd><dt>Open Graph Protocol
</dt><dd><a href="http://check.rdfa.info" class="external text" title="http://check.rdfa.info" rel="nofollow">check.rdfa.info</a>
</dd><dt><a href="http://schema.org" class="external text" title="http://schema.org" rel="nofollow">schema.org</a>
</dt><dd><a href="http://www.google.com/webmasters/tools/richsnippets" class="external text" title="http://www.google.com/webmasters/tools/richsnippets" rel="nofollow">Google's Rich Snippets Testing Tool</a>
</dd><dd><a href="http://check.rdfa.info" class="external text" title="http://check.rdfa.info" rel="nofollow">check.rdfa.info</a>
</dd><dd><a href="http://linter.structured-data.org" class="external text" title="http://linter.structured-data.org" rel="nofollow">Structured Data Linter</a>
</dd><dt><a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#vcard" class="external text" title="http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#vcard" rel="nofollow">vCard</a>
</dt><dd><a href="http://foolip.org/microdatajs/live/" class="external text" title="http://foolip.org/microdatajs/live/" rel="nofollow">Live Microdata</a>
</dd><dt><a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#vevent" class="external text" title="http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#vevent" rel="nofollow">vEvent</a>
</dt><dd><a href="http://foolip.org/microdatajs/live/" class="external text" title="http://foolip.org/microdatajs/live/" rel="nofollow">Live Microdata</a>
</dd></dl>
<p>The goal of publishing HTML data is to enable consumers to reuse it. To make it clear how the HTML data you publish can be reused, you should include information about the rights holder and license that the information is made under. There are a number of vocabularies that enable you to do this, such as schema.org, rel-license, Creative Commons and Dublin Core. Your target consumers should indicate which formats they understand when it comes to expressing licensing information and which licenses they know about, and you should choose a relevant format in the same way as you do for the core data that you are publishing.
</p><p>TODO: add more
</p>
<a name="Consumers" id="Consumers"></a><h2> <span class="mw-headline"> Consumers </span></h2>
<p>You will find it easier to consume and combine data published using a single format (syntax and vocabulary). To decide which to consume, you should first look at what formats your target publishers are currently using. It may be that these contain sufficient information for your application.
</p><p>If the publishers whom you are targeting are already publishing using multiple formats, you may want to <a href="/wiki/Mixing_HTML_Data_Formats#Consuming_Pages_with_Multiple_Formats" title="Mixing HTML Data Formats">consume from all those formats</a> in order to maximise the data that you can collect while minimising the impact on the publishers who are providing that information. If you are consuming microdata and storing the results as RDF, you should <a href="/wiki/Mapping_Microdata_to_RDF" title="Mapping Microdata to RDF">follow a standard mapping</a>.
</p><p>If current formats do not encode the information you need to the detail you need it for your application, publishers will be more likely to publish extra data for you to consume if you:
</p>
<ul><li> <a href="/wiki/HTML_Data_Vocabularies" title="HTML Data Vocabularies">extend existing common vocabularies</a> they are already using
</li><li> consume data from a syntax they already use
</li></ul>
<p>If you cannot simply extend an existing vocabulary, you will need to create your own vocabulary and choose which syntaxes to support with that vocabulary.
</p>
<a name="Choosing_a_Syntax_to_Consume" id="Choosing_a_Syntax_to_Consume"></a><h3> <span class="mw-headline"> Choosing a Syntax to Consume </span></h3>
<p>As you choose a syntax, you should take into account the following considerations.
</p>
<a name="Tooling_Considerations" id="Tooling_Considerations"></a><h4> <span class="mw-headline"> Tooling Considerations </span></h4>
<p>Applications vary widely in terms of the tooling that they need. A script that runs in a publisher's page needs easy access to data through a DOM API. A crawler that creates a store of data from a set of distributed pages requires a server-side parser and good storage and querying support.
</p><p>As a consumer, you will be led by the requirements you have for your application and the experience that you have with different technology sets. It's important, however, to also consider the experience and capabilities of the publishers that are providing you with data, and which formats they will find easy to publish given their tooling. You should also consider the ease with which you can provide support tools for the format, such as validators or previewers that make it easy for publishers to tell whether they have published data correctly within their pages.
</p><p>There are several specifications that can be used to provide standard mechanisms for accessing, manipulating, querying and validating data gleaned from HTML pages. However, you should check what has been implemented in your environment: it may be that there isn't an implementation that follows a standard, but there is one that provides its own API which enables you to do what you need to do.
</p>
<a name="microdata.2Fmicroformats-2_data_model" id="microdata.2Fmicroformats-2_data_model"></a><h5> <span class="mw-headline">  microdata/microformats-2 data model </span></h5>
<p>Microdata and microformats-2 can be mapped to the same <a href="http://dev.w3.org/html5/md/Overview.html#json" class="external text" title="http://dev.w3.org/html5/md/Overview.html#json" rel="nofollow">basic (JSON) data model</a>. Processing JSON into native programming structures, in Javascript and other languages, is usually very easy. Vocabularies are usually described in specification prose rather than a formal language.
</p>
<ul><li> <a href="http://dev.w3.org/html5/md/Overview.html#microdata-dom-api" class="external text" title="http://dev.w3.org/html5/md/Overview.html#microdata-dom-api" rel="nofollow">microdata DOM API</a> &mdash; part of microdata specification (W3C Last Call Working Draft)
</li><li> <a href="http://tools.ietf.org/html/draft-zyp-json-schema-03" class="external text" title="http://tools.ietf.org/html/draft-zyp-json-schema-03" rel="nofollow">JSON Schema</a> &mdash; schema language for JSON (IETF Internet Draft)
</li></ul>
<a name="RDF_data_model" id="RDF_data_model"></a><h5> <span class="mw-headline"> RDF data model </span></h5>
<p>RDFa processors extract an RDF data model and processors can also generate <a href="/wiki/Mapping_Microdata_to_RDF" title="Mapping Microdata to RDF">RDF from microdata</a>. There are a number of standards for formally expressing RDF vocabularies and querying RDF, and drafts in progress for DOM-based manipulation of RDFa content.
</p>
<ul><li> <a href="http://www.w3.org/TR/rdfa-api/" class="external text" title="http://www.w3.org/TR/rdfa-api/" rel="nofollow">RDFa API</a> &mdash; W3C Working Draft
</li><li> <a href="http://json-ld.org/spec/latest/" class="external text" title="http://json-ld.org/spec/latest/" rel="nofollow">JSON-LD</a> &mdash; JSON representation of RDF (Unofficial Draft)
</li><li> <a href="http://www.w3.org/TR/rdf-sparql-query/" class="external text" title="http://www.w3.org/TR/rdf-sparql-query/" rel="nofollow">SPARQL</a> &mdash; query language for RDF (W3C Recommendation)
</li><li> <a href="http://www.w3.org/TR/sparql11-overview/" class="external text" title="http://www.w3.org/TR/sparql11-overview/" rel="nofollow">SPARQL 1.1</a> &mdash; W3C Working Draft
</li><li> <a href="http://www.w3.org/TR/rdf-schema/" class="external text" title="http://www.w3.org/TR/rdf-schema/" rel="nofollow">RDFS</a> &mdash; vocabulary description language for RDF (W3C Recommendation)
</li><li> <a href="http://www.w3.org/TR/owl-primer/" class="external text" title="http://www.w3.org/TR/owl-primer/" rel="nofollow">OWL</a> &mdash; ontology language for RDF (W3C Recommendation)
</li></ul>
<a name="Data_Model_Considerations" id="Data_Model_Considerations"></a><h4> <span class="mw-headline"> Data Model Considerations </span></h4>
<p>Microdata uses a JSON-based data model of a tree of objects which may be identified through a URI, with properties whose values are strings. microformats-2 uses a similar JSON-based data model of a tree of objects, but they do not have identifiers and their property values may be strings, URLs, date/times or structured HTML values. RDFa uses RDF as its data model, which is a graph of objects identified by URLs with properties whose values may be other objects, lists or literal values which can be tagged with a language or any datatype. These different models have different capabilities.
</p>
<dl><dt>Structured HTML values
</dt><dd>Under appropriate conditions, RDFa and microformats will use markup within the content of an element to provide a property value; in microdata values never retain markup. If you wish to consume data that may contain markup &mdash; be it structures such as multiple paragraphs, list items, tables, or inline markup such as emphases, links or ruby markup &mdash; you will need publishers to use RDFa or microformats to mark up that data. In RDFa, this is done by publishers adding <code>datatype="rdf:XMLLiteral"</code> to elements whose markup should be preserved. In microformats, the handling of the content of an element is determined by the property; in microformats-2, those that retain the HTML structure are named with a <code>e-*</code> prefix, such as <code>e-content</code>.
</dd><dt>Language support
</dt><dd>Microformats and RDFa use the language of the HTML elements in the page (from the <code>lang</code> attribute) to indicate the language of relevant values. In microdata, the vocabulary has to provide a separate mechanism to indicate a language (pending resolution of <a href="http://www.w3.org/Bugs/Public/show_bug.cgi?id=14470" class="external text" title="http://www.w3.org/Bugs/Public/show_bug.cgi?id=14470" rel="nofollow">bug 14470</a>). If you are consuming information about the same things from pages that use different languages, or anticipate publishers using multiple languages in their pages to describe a particular entity, you can automatically pick up the language of the content of the page if publishers use microformats or RDFa. If you consume microdata, you need to provide specific properties in your vocabulary that publishers can use to indicate the language of the content.
</dd></dl>
<a name="Usability_Considerations_2" id="Usability_Considerations_2"></a><h4> <span class="mw-headline"> Usability Considerations </span></h4>
<p>Publishing data within HTML can be a challenge for publishers, simply because the structure of the data that they publish is not immediately visible within their pages. The publishers you are targeting will have different levels of skill and experience, which may influence your choice of syntax and the way in which you design your vocabulary. If you can, you should try to work closely with a few target publishers to better understand their requirements and constraints. Experimenting with marking up a few of their existing pages will often highlight issues with both syntax and vocabulary.
</p><p>Some usability issues may be addressed by restricting the set of attributes that you instruct publishers how to use, or by restricting their location to provide more consistency. For example:
</p>
<ul><li> <a href="http://www.w3.org/2010/02/rdfa/sources/rdfa-lite/Overview-src.html" class="external text" title="http://www.w3.org/2010/02/rdfa/sources/rdfa-lite/Overview-src.html" rel="nofollow">RDFa 1.1 Lite</a> is an authoring profile of RDFa 1.1 that is sufficient for most data publishing
</li><li> most microdata markup does not require <code>@itemid</code> or <code>@itemref</code>
</li><li> constraining data markup to the <code>head</code> of an HTML document can make it easier to author and protect it from templating changes, although it also runs the risk of getting out of sync with the content of the page, increases repetition, and is hard to use for anything but flat data structures
</li></ul>
<p>Profiling microdata and RDFa is useful for documentation, but consumers should still recognise and understand the full set of syntactic constructs described by the standards. This ensures that those publishers who find that they need the more advanced constructs to mark up their pages can do so, and means that publishers can use general-purpose tools and documentation rather than just those that you provide.
</p>
<a name="Good_Consumption_Practice" id="Good_Consumption_Practice"></a><h3> <span class="mw-headline"> Good Consumption Practice </span></h3>
<p>It is good practice for a consumer to provide tools that help publishers to see how the data within their pages is interpreted by the consumer and that highlight any errors in the markup, such as invalid values or missing required properties.
</p><p>It is good practice for consumers to ignore markup that uses syntax or vocabularies that they do not understand. Properties and types in unrecognised vocabularies should be ignored by consumers.
</p><p>The presence of HTML data within a website does not imply that the data can be used without restriction. Publishers may license the information provided through HTML data, for example to restrict it to non-commercial use or to use only with attribution. It is good practice for a consumer to honour licenses and to indicate to publishers which formats they recognise for expressing licensing information within HTML pages, and which licenses they recognise as indicating that the data within the page is consumable. Typical vocabularies for expressing this information are schema.org, rel-license, Creative Commons or Dublin Core.
</p><p>Even when the use of data is unrestricted, it is good practice for consumers to record the source of the information that they use and, when republishing that data, provide metadata about the rights holder, source and license under which the information is available, using the same vocabularies as those listed above.
</p><p>TODO: More?
</p>
<!--
NewPP limit report
Preprocessor node count: 16/1000000
Post-expand include size: 0/2097152 bytes
Template argument size: 0/2097152 bytes
Expensive parser function count: 0/100
-->

<!-- Saved in parser cache with key wikidb-esw_:pcache:idhash:6021-0!1!0!!en!2!edit=0 and timestamp 20120115204851 -->
<!-- RDFa description of this page.
     Attributes are separated by whitespace, as XML well-formedness requires,
     and typeof is a prefix:reference CURIE naming the page's category
     (the same category that the category link below this block points to). -->
<div id="RDFa"
     about="http://www.w3.org/wiki/Choosing_an_HTML_Data_Format"
     xmlns:wiki_1="http://www.w3.org/wiki/index.php/"
     xmlns:wiki_1_property="http://www.w3.org/wiki/index.php/Property:"
     xmlns:wiki_1_category="http://www.w3.org/wiki/index.php/Category:"
     typeof="wiki_1_category:HTML_Data_TF">
<div property="wiki_1_property:Modification_date" content="27 November 2011 20:46:43"></div>
</div><div style="display:none"></div><div class="printfooter">
Retrieved from "<a href="http://www.w3.org/wiki/Choosing_an_HTML_Data_Format">http://www.w3.org/wiki/Choosing_an_HTML_Data_Format</a>"</div>
			<!-- Category links for this page -->
			<div id="catlinks" class="catlinks">
				<div id="mw-normal-catlinks"><a href="/wiki/Special:Categories" title="Special:Categories">Category</a>:&#32;<span dir="ltr"><a href="/wiki/index.php?title=Category:HTML_Data_TF&amp;action=edit&amp;redlink=1" class="new" title="Category:HTML Data TF (page does not exist)">HTML Data TF</a></span></div>
			</div>
			<!-- end content -->
			<div class="visualClear"></div>
		</div>
	</div>
		</div>
		<div id="column-one">
	<!-- "Views" portlet: page, discussion, view-source and history tabs -->
	<div id="p-cactions" class="portlet">
		<h5>Views</h5>
		<div class="pBody">
			<ul>
				<li id="ca-nstab-main" class="selected"><a href="/wiki/Choosing_an_HTML_Data_Format" title="View the content page [c]" accesskey="c">Page</a></li>
				<li id="ca-talk" class="new"><a href="/wiki/index.php?title=Talk:Choosing_an_HTML_Data_Format&amp;action=edit&amp;redlink=1" title="Discussion about the content page [t]" accesskey="t">Discussion</a></li>
				<li id="ca-viewsource"><a href="/wiki/index.php?title=Choosing_an_HTML_Data_Format&amp;action=edit" title="This page is protected.&#10;You can view its source [e]" accesskey="e">View source</a></li>
				<li id="ca-history"><a href="/wiki/index.php?title=Choosing_an_HTML_Data_Format&amp;action=history" title="Past revisions of this page [h]" accesskey="h">History</a></li>
			</ul>
		</div>
	</div>
	<!-- "Personal tools" portlet: holds the log-in link -->
	<div id="p-personal" class="portlet">
		<h5>Personal tools</h5>
		<div class="pBody">
			<ul>
				<li id="pt-login">
					<a href="/wiki/index.php?title=Special:UserLogin&amp;returnto=Choosing_an_HTML_Data_Format" title="You are encouraged to log in; however, it is not mandatory [o]" accesskey="o">Log in</a>
				</li>
			</ul>
		</div>
	</div>
	<!-- Logo portlet: an anchor to the main page whose image is supplied as a CSS background.
	     NOTE(review): the anchor has no text content; its only label is the title attribute. -->
	<div id="p-logo" class="portlet">
		<a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z" style="background-image: url(/Icons/w3c_home);"></a>
	</div>
	<!-- Calls fixalpha() only when window.isMSIE55 is set -->
	<script type="text/javascript">
		if (window.isMSIE55) { fixalpha(); }
	</script>
	<!-- "Navigation" sidebar portlet: main page, categories, recent changes, help -->
	<div id="p-navigation" class="generated-sidebar portlet">
		<h5>Navigation</h5>
		<div class="pBody">
			<ul>
				<li id="n-mainpage">
					<a href="/wiki/Main_Page" title="Visit the main page">Main Page</a>
				</li>
				<li id="n-Browse-categories">
					<a href="/wiki/Special:Categories">Browse categories</a>
				</li>
				<li id="n-recentchanges">
					<a href="/wiki/Special:RecentChanges" title="The list of recent changes in the wiki [r]" accesskey="r">Recent changes</a>
				</li>
				<li id="n-Help">
					<a href="http://www.mediawiki.org/wiki/Help:Contents">Help</a>
				</li>
			</ul>
		</div>
	</div>
	<!-- "Search" portlet: submits to /wiki/index.php with title=Special:Search;
	     the "go" and "fulltext" submit buttons share the same text input -->
	<div id="p-search" class="portlet">
		<h5><label for="searchInput">Search</label></h5>
		<div id="searchBody" class="pBody">
			<form id="searchform" action="/wiki/index.php"><div>
				<input type="hidden" name="title" value="Special:Search" />
				<input type="text" id="searchInput" name="search" title="Search W3C Wiki [f]" accesskey="f" value="" />
				<input type="submit" name="go" class="searchButton" id="searchGoButton" value="Go" title="Go to a page with this exact name if exists" />&nbsp;
				<input type="submit" name="fulltext" class="searchButton" id="mw-searchButton" value="Search" title="Search the pages for this text" />
			</div></form>
		</div>
	</div>
	<!-- "Toolbox" portlet: page-related utility links, one list item per line -->
	<div id="p-tb" class="portlet">
		<h5>Toolbox</h5>
		<div class="pBody">
			<ul>
				<li id="t-whatlinkshere"><a href="/wiki/Special:WhatLinksHere/Choosing_an_HTML_Data_Format" title="List of all wiki pages that link here [j]" accesskey="j">What links here</a></li>
				<li id="t-recentchangeslinked"><a href="/wiki/Special:RecentChangesLinked/Choosing_an_HTML_Data_Format" title="Recent changes in pages linked from this page [k]" accesskey="k">Related changes</a></li>
				<li id="t-specialpages"><a href="/wiki/Special:SpecialPages" title="List of all special pages [q]" accesskey="q">Special pages</a></li>
				<li id="t-print"><a href="/wiki/index.php?title=Choosing_an_HTML_Data_Format&amp;printable=yes" rel="alternate" title="Printable version of this page [p]" accesskey="p">Printable version</a></li>
				<li id="t-permalink"><a href="/wiki/index.php?title=Choosing_an_HTML_Data_Format&amp;oldid=55735" title="Permanent link to this revision of the page">Permanent link</a></li>
				<li id="t-smwbrowselink"><a href="/wiki/Special:Browse/Choosing_an_HTML_Data_Format" title="Special:Browse/Choosing an HTML Data Format">Browse properties</a></li>
			</ul>
		</div>
	</div>
		</div><!-- end of the left (by default at least) column -->
			<div class="visualClear"></div>
			<!-- Page footer: "powered by" badge followed by page statistics and policy links -->
			<div id="footer">
				<!-- width/height reserve the badge's space before the image loads;
				     the values follow the 88x31 size given in the image file name -->
				<div id="f-poweredbyico"><a href="http://www.mediawiki.org/"><img src="/wiki/skins/common/images/poweredby_mediawiki_88x31.png" alt="Powered by MediaWiki" width="88" height="31" /></a></div>
			<ul id="f-list">
					<li id="lastmod"> This page was last modified on 27 November 2011, at 20:50.</li>
					<li id="viewcount">This page has been accessed 8,375 times.</li>
					<li id="privacy"><a href="/wiki/W3C_Wiki:Privacy_policy" title="W3C Wiki:Privacy policy">Privacy policy</a></li>
					<li id="about"><a href="/wiki/W3C_Wiki:About" title="W3C Wiki:About">About W3C Wiki</a></li>
					<li id="disclaimer"><a href="/wiki/W3C_Wiki:General_disclaimer" title="W3C Wiki:General disclaimer">Disclaimers</a></li>
			</ul>
		</div>
</div>

		<!-- Invokes runOnloadHook() when it is defined on window -->
		<script type="text/javascript">
			if (window.runOnloadHook) { runOnloadHook(); }
		</script>
<!-- Served in 0.248 secs. --></body></html>