<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<?rfc toc="yes"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes"?>
<?rfc compact="yes"?>
<?rfc comments="yes"?>
<?rfc inline="yes"?>
<?rfc subcompact="no"?>
<?rfc rfcedstyle="yes"?>
<?rfc-ext allow-markup-in-artwork="yes" ?>

<!DOCTYPE rfc [
  <!ENTITY MAY "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>MAY</bcp14>">
  <!ENTITY MUST "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>MUST</bcp14>">
  <!ENTITY MUST-NOT "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>MUST NOT</bcp14>">
  <!ENTITY OPTIONAL "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>OPTIONAL</bcp14>">
  <!ENTITY RECOMMENDED "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>RECOMMENDED</bcp14>">
  <!ENTITY REQUIRED "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>REQUIRED</bcp14>">
  <!ENTITY SHALL "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>SHALL</bcp14>">
  <!ENTITY SHALL-NOT "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>SHALL NOT</bcp14>">
  <!ENTITY SHOULD "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>SHOULD</bcp14>">
  <!ENTITY SHOULD-NOT "<bcp14 xmlns='http://purl.org/net/xml2rfc/ext'>SHOULD NOT</bcp14>">
]>

<rfc xmlns:x="http://purl.org/net/xml2rfc/ext" xmlns:ed="http://greenbytes.de/2002/rfcedit" ipr="trust200902" docName="draft-reschke-ref-parsing-latest" category="info" xml:lang="en">
	<front>
  <title abbrev="Parsing URI/IRI references">Processing potentially invalid URI and IRI References</title>
  <author initials="J. F." surname="Reschke" fullname="Julian F. Reschke">
    <organization abbrev="greenbytes">greenbytes GmbH</organization>
    <address>
      <postal>
        <street>Hafenweg 16</street>
        <city>Muenster</city><region>NW</region><code>48155</code>
        <country>Germany</country>
      </postal>
      <email>julian.reschke@greenbytes.de</email>	
      <uri>http://greenbytes.de/tech/webdav/</uri>	
    </address>
  </author>

  <date month="July" year="2011"/>
  
  <abstract>
    <t>
      The parsing of Uniform Resource Identifiers (URIs, RFC 3986) and 
      Internationalized Resource Identifiers (IRIs, RFC 3987) is defined
      in terms of Augmented Backus-Naur Form (ABNF). The ABNF grammars
      are defined in terms of valid identifiers, and thus technically do not
      address how to handle invalid ones.
    </t>
    <t>
      The URI specification, however, includes a note on how to use regular
      expressions for parsing, and this note applies to invalid identifiers
      as well. This document introduces terminology referring to potentially
      invalid identifiers, and demonstrates how the rules in the URI
      specification can be applied to them.
    </t>
  </abstract>
  
  <note title="Editorial Note (To be removed by RFC Editor before publication)">
    <t>
      Distribution of this document is unlimited. Although this is not a work
      item of the IRI Working Group, comments should be sent to the 
      IRI mailing list at <eref target="mailto:public-iri@w3.org">public-iri@w3.org</eref>,
      which may be joined by sending a message with subject 
      "subscribe" to <eref target="mailto:public-iri-request@w3.org?subject=subscribe">public-iri-request@w3.org</eref>.
    </t>
    <t>
      Discussions of the IRI Working Group are archived at
      <eref target="http://lists.w3.org/Archives/Public/public-iri/"/>.               
    </t> 
    <t>
      XML versions, latest edits, and the issues list for this document
      are available from <eref target="http://greenbytes.de/tech/webdav/#draft-reschke-ref-parsing"/>.
    </t>
  </note>
  </front>

  <middle>


<ed:issue name="edit" type="edit" status="open">
  <ed:item entered-by="julian.reschke@greenbytes.de" date="2011-07-02">
    Umbrella issue for editorial fixes/enhancements.
  </ed:item>
</ed:issue>
<ed:issue name="iri" type="change" status="open">
  <ed:item entered-by="julian.reschke@greenbytes.de" date="2011-07-02">
    Expand for IRIs.
  </ed:item>
</ed:issue>
<ed:issue name="proc" type="change" status="proc">
  <ed:item entered-by="julian.reschke@greenbytes.de" date="2011-07-02">
    Re-state the parsing algorithm as a procedural algorithm, maybe in JS?
  </ed:item>
  <ed:item entered-by="derhoermi@gmx.net" date="2011-07-03">
We can turn the regular expression into a concise ABNF grammar if that
helps, we might also adapt it if that is found to be necessary, but I do
not see a reason why hundreds of lines of prose code or JavaScript code
would help (and anyone who would like to have that anyway can easily
derive it from the expression or from a grammar).
  </ed:item>
</ed:issue>
<ed:issue name="pre" type="change" status="proc">
  <ed:item entered-by="julian.reschke@greenbytes.de" date="2011-07-02">
    Define pre-processing steps for extraction of candidate references
    from content (WS stripping)?
  </ed:item>
</ed:issue>
<ed:issue name="post" type="change" status="proc">
  <ed:item entered-by="julian.reschke@greenbytes.de" date="2011-07-02">
    Define post-processing steps, such as query component rewriting based
    on document encoding.
  </ed:item>
</ed:issue>



<section title="Introduction" anchor="introduction">
  <t>
    The parsing of Uniform Resource Identifiers (URIs, <xref target="RFC3986"/>) and 
    Internationalized Resource Identifiers (IRIs, <xref target="RFC3987"/>) is defined
    in terms of Augmented Backus-Naur Form (ABNF). The ABNF grammars
    are defined in terms of valid identifiers, and thus technically do not
    address how to handle invalid ones.
  </t>
  <t>
    The URI specification, however, includes a note on how to use regular
    expressions for parsing, and this note applies to invalid identifiers
    as well. This document introduces terminology referring to potentially
    invalid identifiers, and demonstrates how the rules in the URI
    specification can be applied to them.
  </t>
</section>  

<!--<section title="Notational Conventions">
<t>
  The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
  "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document
  are to be interpreted as described in <xref target="RFC2119"/>.
</t>
<t>
  This specification uses the ABNF (Augmented Backus-Naur Form) notation defined in
  <xref target="RFC5234"/>. The following core rules are included by
  reference, as defined in <xref target="RFC5234" x:fmt="," x:sec="B.1"/>:
  ALPHA (letters), DIGIT (decimal 0-9), HEXDIG (hexadecimal 0-9/A-F/a-f) and
  LWSP (linear white space).
</t>
</section>  -->

<section title="Terminology" anchor="terminology">
<t>
  In addition to the terms defined in the URI specification, namely
  the Syntax Components (see <xref target="RFC3986" x:fmt="of" x:sec="3"/>),
  this document defines:
</t>
<t>
  <x:dfn>Candidate URI Reference</x:dfn>
  <list>
    <t>
      A string that may or may not be a valid URI-reference according to
      <xref target="RFC3986" x:fmt="of" x:sec="4.1"/>.
    </t>
  </list>
</t>
<t>
  <x:dfn>Candidate Scheme Component</x:dfn>
  <list>
    <t>
      A string that may or may not be a valid URI scheme component according to
      <xref target="RFC3986" x:fmt="of" x:sec="3.1"/>.
    </t>
  </list>
</t>
<t>
  <x:dfn>Candidate Authority Component</x:dfn>
  <list>
    <t>
      A string that may or may not be a valid URI authority component according to
      <xref target="RFC3986" x:fmt="of" x:sec="3.2"/>.
    </t>
  </list>
</t>
<t>
  <x:dfn>Candidate Path Component</x:dfn>
  <list>
    <t>
      A string that may or may not be a valid URI path component according to
      <xref target="RFC3986" x:fmt="of" x:sec="3.3"/>.
    </t>
  </list>
</t>
<t>
  <x:dfn>Candidate Query Component</x:dfn>
  <list>
    <t>
      A string that may or may not be a valid URI query component according to
      <xref target="RFC3986" x:fmt="of" x:sec="3.4"/>.
    </t>
  </list>
</t>
<t>
  <x:dfn>Candidate Fragment Component</x:dfn>
  <list>
    <t>
      A string that may or may not be a valid URI fragment component according to
      <xref target="RFC3986" x:fmt="of" x:sec="3.5"/>.
    </t>
  </list>
</t>
</section>

<section title="Processing">
<section title="Parsing a Candidate URI Reference into Components" anchor="parsing">
<t>
  The regular expression given in <xref target="RFC3986" x:fmt="of" x:sec="B"/>
  will parse any input string into a Candidate Scheme Component, a Candidate Authority
  Component, a Candidate Path Component, a Candidate Query Component, and a 
  Candidate Fragment Component. Note that, of these five components, all
  except the Candidate Path Component can be undefined.
</t>
<t>
  If each of the defined components is valid according to the related URI component
  definition, the input was a valid URI reference.
</t>
</section>

<section title="Resolution of Candidate References">
<ed:issue xmlns="http://www.w3.org/1999/xhtml" name="combine-valid" type="change" status="open" href="http://lists.w3.org/Archives/Public/public-iri/2011Jul/0014.html">
  <ed:item entered-by="derhoermi@gmx.net" date="2011-07-03">
In section 3.2 you have "The result will be a valid URI Reference if
and only if the components used by the algorithm were valid themselves."
I have some doubts about "only if", consider for instance removing dot
segments, which might remove a malformed part, if I recall correctly.
  </ed:item>
</ed:issue>
<t>
  <xref target="RFC3986" x:fmt="of" x:sec="5"/> defines Reference Resolution
  based on the five components. This algorithm works for components
  obtained from both valid and invalid references. The result will be a valid
  URI Reference if and only if the components used by the algorithm were valid
  themselves.
</t>
</section>

</section>

<section title="Security Considerations" anchor="security.considerations">
<t>
  <cref>TBD</cref>
</t>
</section>  

<section title="IANA Considerations" anchor="iana.considerations">
<t>
  There are no IANA Considerations related to this specification.
</t>
</section>  

<!--<section title="Acknowledgements">
<t>
</t>
</section>-->  
  </middle>
  <back>
  
<references title="Normative References">
  
<!--  <reference anchor="RFC2119">
    <front>
      <title abbrev="RFC Key Words">Key words for use in RFCs to Indicate Requirement Levels</title>
      <author initials="S." surname="Bradner" fullname="Scott Bradner">
        <organization>Harvard University</organization>
        <address><email>sob@harvard.edu</email></address>
      </author>
      <date month="March" year="1997"/>
    </front>
    <seriesInfo name="BCP" value="14"/>
    <seriesInfo name="RFC" value="2119"/>
  </reference>-->
  
  <reference anchor="RFC3986">
   <front>
    <title abbrev='URI Generic Syntax'>Uniform Resource Identifier (URI): Generic Syntax</title>
    <author initials='T.' surname='Berners-Lee' fullname='Tim Berners-Lee'>
      <organization abbrev="W3C/MIT">World Wide Web Consortium</organization>
      <address>
         <email>timbl@w3.org</email>
      </address>
    </author>
    <author initials='R.' surname='Fielding' fullname='Roy T. Fielding'>
      <organization abbrev="Day Software">Day Software</organization>
      <address>
        <email>fielding@gbiv.com</email>
      </address>
    </author>
    <author initials='L.' surname='Masinter' fullname='Larry Masinter'>
      <organization abbrev="Adobe Systems">Adobe Systems Incorporated</organization>
      <address>
        <email>LMM@acm.org</email>
      </address>
    </author>
    <date month='January' year='2005'></date>
   </front>
   <seriesInfo name="STD" value="66"/>
   <seriesInfo name="RFC" value="3986"/>
  </reference>

</references>

<references title="Informative References">

  <reference anchor="RFC3987">
    <front>
      <title>Internationalized Resource Identifiers (IRIs)</title>
      <author initials="M." surname="Duerst" fullname="M. Duerst" />
      <author initials="M." surname="Suignard" fullname="M. Suignard" />
      <date year="2005" month="January"/>
    </front>
    <seriesInfo name="RFC" value="3987"/>
  </reference>

</references>


<section title="Implementations">
<t>
  <eref target="http://greenbytes.de/tech/tc/uris/"/> shows results for the
  parsing/resolution processing described above, based on a test implementation
  written in XSLT 2.0.
</t>
</section>

<section title="Change Log (to be removed by RFC Editor before publication)" anchor="change.log">
<section title="Since draft-reschke-ref-parsing-00">
<t>
  Added issue <ed:issueref>combine-valid</ed:issueref>.
</t>
</section>
</section>



  </back>

</rfc>