Commit c3c45f29 authored by edepuiff's avatar edepuiff
Browse files

Lexique_to_hdoc: gestion des ressources

Merge branch 'master' of https://gitlab.utc.fr/crozatst/hdoc
parents 841d69c9 94f77a55
This diff is collapsed.
# HDOC CONVERTER PROJECT
# Hdoc Converter Project
License GPL3.0
http://www.gnu.org/licenses/gpl-3.0.txt
Credits :
Université de Technologie de Compiègne (http://www.utc.fr)
NF29 students (http://www4.utc.fr/~nf29)
## What is Hdoc ?
Please refer to the [Hdoc converter project website](http://hdoc.crzt.fr/co/hdocConverter.html)
Please refer to the Hdoc Converter Project website:
*Extract*
>The aim of the project is to propose:
>- a generic XML schema based on XHTML5 for documentary purpose (Hdoc format);
>- a set of converters to transform document formats from and to Hdoc;
>- a web site to manage the converters (Hdoc Converter Portal).
http://hdoc.crzt.fr
## What is this repository ?
This repository gathers some of the Hdoc converters, if not all of them.
Project URL : https://gitlab.utc.fr/crozatst/hdoc.git
## How to use Hdoc Converters ?
This repository gathers some of the Hdoc converters, if not all of them.
\ No newline at end of file
In order to use a converter, choose the corresponding folder and consult README.md for instructions.
# antce
"antce" is not meant to be used on its own; it is only a base for an autonomous multi-OS Ant launcher.
# HDOC CONVERTER PROJECT
\ No newline at end of file
# Converter etherpad_to_hdoc
## License
License GPL3.0
http://www.gnu.org/licenses/gpl-3.0.txt
## Credits
- 2015
- Jean-Côme Douteau
- Gabrielle Rit
- Jean Vintache
- 2014
- Fecherolle Cécile
## Presentation
This module is able to convert several [etherpad](http://etherpad.org/) files (exported as html files) to the hdoc format.
## User documentation
### Running etherpad_to_hdoc.ant
1. Create an etherpad document and export it as an html file.
1. please place your html files in the `/input` folder
2. run the `run.[bat|sh]` script of your choice depending on your OS
3. and retrieve the hdoc outputs in the `/output` folder
## Unsupported
- Markdown
- Authorship attribution
- Etherpad timeline
- Chat
## Known bugs
- Nested lists in lists are not supported
Example :
`<ul>
<li>
<ul>
<li>
Never gonna give you up.
</li>
</ul>
</li>
</ul>`
- As a consequence, etherpad indentation is not supported because it is coded as nested lists.
## TODO
- Markdown
## Technical notes
### Description of etherpad_to_hdoc.ant
#### Prelude
- Importation of necessary classes (antlib, htmlcleaner, jing)
- Creation of directories architecture tree
#### Transformations
- Use of htmlcleaner to transform the input file from html to xhtml. For more info, see http://htmlcleaner.sourceforge.net/index.php.
- Apply html2xhtml.xsl : this xsl extracts the content into <body> tags
- Apply html2xhtmlv1.xsl : this xsl is used as a fix and adds br tag at the end of lists (ul and ol)
- Apply html2xhtmlv2.xsl : this xsl surrounds each text line with p tags and transforms non-hdoc tags (such as s, u and strong) into hdoc tags.
- Apply html2xhtml3.xsl : this xsl is used as a fix; it removes p tags whose child is ul or ol
- Apply xhtml2hdoc.xsl : this xsl transforms the content into hdoc structure
#### Post-transformations actions
- Build hdoc structure
- Jing checks that the output file is valid against the hdoc rng schema
- Zip the directory into hdoc archive
### Supported tags
html tags -> hdoc tags
- u, s, em, strong -> em
- li -> li
- ol -> ol
- br -> p
## Capitalisation
We learned how to use XSL stylesheets with a text file as input: we had to use regular expressions to extract the content.
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project basedir="." name="myantce" default="main">
<property file="etherpad_to_hdoc.properties"/>
<taskdef name="htmlcleaner" classname="org.htmlcleaner.HtmlCleanerForAnt">
<classpath>
<pathelement location="lib/htmlcleaner-2.16.jar"/>
</classpath>
</taskdef>
<taskdef name="jing" classname="com.thaiopensource.relaxng.util.JingTask">
<classpath>
<pathelement location="lib/jing.jar"/>
</classpath>
</taskdef>
<!-- import classes -->
<taskdef resource="net/sf/antcontrib/antlib.xml"/>
<taskdef name="htmlcleaner" classname="org.htmlcleaner.HtmlCleanerForAnt"/>
<taskdef name="jing" classname="com.thaiopensource.relaxng.util.JingTask"/>
<target name="start">
<delete dir="${tmp}" failonerror="false"/>
......@@ -22,64 +14,105 @@
<delete dir="${log}" failonerror="false"/>
<mkdir dir="${log}"/>
</target>
<!-- clean-tmp: throws away the working directory ${tmp} and recreates it empty. -->
<target name="clean-tmp">
  <!-- failonerror is off so that a missing ${tmp} does not stop the build -->
  <delete failonerror="false" dir="${tmp}"/>
  <mkdir dir="${tmp}"/>
</target>
<!--
  apply-xslt: turns one exported pad into ${tmp}/${properName}/tozip/content.xml.

  Properties expected from the caller (passed as <param> on the antcall):
    fileName   : name of the html file inside ${in}
    properName : name used for the per-pad sub-directory of ${tmp}

  The former single-pad steps (hard-coded ${in}/pad.html and ${tmp}/tmpPad*.xhtml)
  were removed: they repeated every step below against paths that the rest of
  the build no longer produces or consumes.
-->
<target name="apply-xslt">
  <!-- clean html file (html -> xhtml) -->
  <htmlcleaner src="${in}/${fileName}" dest="${tmp}/pad-clean.xml"/>

  <!-- The stylesheet is given as its own input; per the original note it
       loads pad-clean.xml internally. -->
  <xslt in="${xsl}/html2xhtml.xsl"
        out="${tmp}/${properName}/tmpPad.xhtml"
        style="${xsl}/html2xhtml.xsl"/>
  <!-- pad-clean.xml is shared by all pads, so it is dropped as soon as it has been consumed -->
  <delete file="${tmp}/pad-clean.xml"/>

  <xslt in="${tmp}/${properName}/tmpPad.xhtml"
        out="${tmp}/${properName}/tmpPad2.xhtml"
        style="${xsl}/html2xhtmlv1.xsl"/>
  <xslt in="${tmp}/${properName}/tmpPad2.xhtml"
        out="${tmp}/${properName}/tmpPad3.xhtml"
        style="${xsl}/html2xhtmlv2.xsl"/>
  <!-- NOTE(review): the original author remarked (in French) that the
       following pass apparently serves no purpose; confirm before removing it. -->
  <xslt in="${tmp}/${properName}/tmpPad3.xhtml"
        out="${tmp}/${properName}/tmpPad4.xhtml"
        style="${xsl}/html2xhtml3.xsl"/>
  <!-- final pass: writes the hdoc content file into the directory that gets zipped -->
  <xslt in="${tmp}/${properName}/tmpPad4.xhtml"
        out="${tmp}/${properName}/tozip/content.xml"
        style="${xsl}/xhtml2hdoc.xsl"/>
</target>
<!--
  construct_hdoc: builds the hdoc container skeleton for one pad under
  ${tmp}/${properName}/tozip (META-INF/container.xml and mimetype).

  Property expected from the caller: properName (per-pad sub-directory of ${tmp}).

  The duplicated steps that wrote to ${tmp}/tozip and ${tmp}/META-INF were
  removed; they also left <echoxml> opened twice, which made the target
  ill-formed.
-->
<target name="construct_hdoc">
  <mkdir dir="${tmp}/${properName}/tozip/"/>
  <mkdir dir="${tmp}/${properName}/META-INF/"/>
  <mkdir dir="${tmp}/${properName}/tozip/META-INF"/>

  <!-- container: written without namespace first, then passed through
       addNamespaceToContainer.xsl into the directory that gets zipped -->
  <touch file="${tmp}/${properName}/META-INF/container.xml"/>
  <echoxml file="${tmp}/${properName}/META-INF/container.xml">
    <container version="1.0">
      <rootfiles>
        <rootfile full-path="${rootfilename}" media-type="text/xml"/>
      </rootfiles>
    </container>
  </echoxml>
  <xslt in="${tmp}/${properName}/META-INF/container.xml"
        out="${tmp}/${properName}/tozip/META-INF/container.xml"
        style="${xsl}/addNamespaceToContainer.xsl"/>

  <!-- mimetype -->
  <!-- NOTE(review): this touch creates a file outside tozip/ that no visible
       step reads; kept as in the original. -->
  <touch file="${tmp}/${properName}/mimetype"/>
  <echo message="application/x-hdoc+zip" file="${tmp}/${properName}/tozip/mimetype"/>
</target>
<!--
  jing-hdoc: validates ${tmp}/${properName}/tozip/content.xml against the
  hdoc RELAX NG schema located under ${schema}.

  Property expected from the caller: properName.

  The target used to be opened twice and also validated a fixed
  ${tmp}/tozip/content.xml against a hard-coded schema path; only the
  parametrized validation is kept.
-->
<target name="jing-hdoc">
  <jing file="${tmp}/${properName}/tozip/content.xml"
        rngfile="${schema}/xhtml/hdoc1-xhtml.rng"/>
  <!-- placed after the jing task; the caller's try/catch handles a failed validation -->
  <echo>content.xml respecte le schema hdoc1-xhtml.rng</echo>
</target>
<!--
  Zip Hdoc container: packs ${tmp}/${properName}/tozip/ into
  ${out}/${properName}.hdoc.

  Property expected from the caller: properName.

  The extra zip of ${tmp}/tozip/ into ${out}/output.hdoc was removed: no
  remaining step fills that directory.
-->
<target name="zip">
  <zip basedir="${tmp}/${properName}/tozip/" destfile="${out}/${properName}.hdoc"/>
</target>
<!--
  main: default target. Runs "start" once, then converts every *.html file
  found directly in ${in} into its own .hdoc archive.

  The former fixed sequence of antcalls (start, apply-xslt, construct_hdoc,
  jing-hdoc, zip, clean-tmp) was removed: it invoked the per-pad targets
  without the fileName / properName parameters they need and ran "start" a
  second time.
-->
<target name="main">
  <antcall target="start"/>
  <for param="inputPad">
    <path>
      <fileset dir="${in}" includes="*.html"/>
    </path>
    <sequential>
      <!-- <local> scopes the property to this iteration, so it can take a
           new value for each pad -->
      <local name="padFileName"/>
      <basename property="padFileName" file="@{inputPad}"/>
      <!-- same name with the .html suffix stripped; used for tmp and output names -->
      <local name="properFileName"/>
      <basename property="properFileName" file="@{inputPad}" suffix=".html"/>
      <echo>============ Processing pad : ${properFileName} ============</echo>
      <antcall target="apply-xslt">
        <param name="fileName" value="${padFileName}"/>
        <param name="properName" value="${properFileName}"/>
      </antcall>
      <antcall target="construct_hdoc">
        <param name="properName" value="${properFileName}"/>
      </antcall>
      <trycatch>
        <try>
          <!-- schema verification -->
          <antcall target="jing-hdoc">
            <param name="properName" value="${properFileName}"/>
          </antcall>
          <!-- zip hdoc if the schema is verified -->
          <antcall target="zip">
            <param name="properName" value="${properFileName}"/>
          </antcall>
          <!-- delete tmp files if everything went fine -->
          <delete dir="${tmp}/${properFileName}" failonerror="false"/>
        </try>
        <catch>
          <!-- The pad's tmp files are deliberately left in place for inspection.
               NOTE(review): a failure of the zip step also lands here, although
               the message only mentions the schema. -->
          <echo>WARNING</echo>
          <echo>${properFileName} : the output doesn't match the hdoc schema</echo>
          <echo>see the Jing log above and the tmp files</echo>
        </catch>
      </trycatch>
    </sequential>
  </for>
</target>
</project>
\ No newline at end of file
</project>
......@@ -4,5 +4,6 @@ tmp = ${basedir}/tmp
xsl = ${basedir}/xsl
lib = ${basedir}/lib
log = ${basedir}/log
schema = ../schemas
rootfilename = content.xml
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<grammar ns="urn:utc.fr:ics:hdoc:container"
         datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes"
         xmlns="http://relaxng.org/ns/structure/1.0"
         xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0">
  <a:documentation>This schema describes the META-INF/container.xml file for hdoc format</a:documentation>
  <a:documentation>This schema is a derivation of http://www.idpf.org/epub/30/schema/ocf-container-30.rnc
from EPUB Open Container Format (http://www.idpf.org/epub/30/spec/epub30-ocf.html)
</a:documentation>

  <!-- Document element: container (version 1.0) holding one rootfiles list -->
  <start>
    <element name="container">
      <attribute name="version">
        <value>1.0</value>
      </attribute>
      <element name="rootfiles">
        <ref name="Rootfile"/>
      </element>
    </element>
  </start>

  <!-- Exactly one rootfile entry: the path of the content file and its media type -->
  <define name="Rootfile">
    <element name="rootfile">
      <attribute name="full-path">
        <data type="anyURI"/>
      </attribute>
      <attribute name="media-type">
        <value>text/xml</value>
      </attribute>
    </element>
  </define>
</grammar>
<?xml version="1.0" encoding="UTF-8"?>
<grammar xmlns="http://relaxng.org/ns/structure/1.0"
xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0"
datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes"
ns="http://www.utc.fr/ics/hdoc/xhtml">
<a:documentation>This schema describes the content file for hdoc format</a:documentation>
<a:documentation>This schema is a restriction of XHTML (it validates XHTML
schema, if namespace is changed to http://www.w3.org/1999/xhtml)</a:documentation>
<a:documentation>Last update : 2014-12-22</a:documentation>
<a:documentation>Changes since last update :
Adding biblio management via a href='' data-hdoc-type='bibtexml'
</a:documentation>
<!-- Document element: html, made of the Head pattern followed by the Body pattern -->
<start>
  <element name="html">
    <ref name="Head"/>
    <ref name="Body"/>
  </element>
</start>
<!-- head element: title, charset declaration, then optional generator and content metadata -->
<define name="Head">
  <element name="head">
    <!-- required: title of the document -->
    <element name="title"><text/></element>
    <!-- required: charset declaration, which must be utf-8 -->
    <element name="meta">
      <attribute name="charset">
        <value>utf-8</value>
      </attribute>
    </element>
    <!-- at most once: name of the tool that generated the hdoc source -->
    <optional>
      <element name="meta">
        <attribute name="name">
          <value>generator</value>
        </attribute>
        <attribute name="content">
          <choice>
            <value>SimpleOptim</value>
            <value>HdocConverter/SimpleChain</value>
            <value>SimpleChain</value>
            <value>HdocConverter/Etherpad</value>
            <value>HdocConverter/Opale3.4</value>
            <value>HdocConverter/Dokiel4.0</value>
            <value>HdocConverter/Optim1.2</value>
            <value>HdocConverter/Topaze1.3</value>
            <value>HdocConverter/wikipedia</value>
            <value>HdocConverter/Wordpress</value>
            <value>HdocConverter/OpenDocumentText</value>
            <value>HdocConverter/LaTeX</value>
            <value>HdocConverter/OPML</value>
            <value>HdocConverter/DITA</value>
          </choice>
        </attribute>
      </element>
    </optional>
    <!-- any number of metadata entries about the content; the content value is free text.
         The original note carried a TODO about "date": the name is accepted here,
         but its value is not constrained to a date format. -->
    <zeroOrMore>
      <element name="meta">
        <attribute name="name">
          <choice>
            <value>description</value>
            <value>keywords</value>
            <value>author</value>
            <value>rights</value>
            <value>date</value>
          </choice>
        </attribute>
        <attribute name="content">
          <text/>
        </attribute>
      </element>
    </zeroOrMore>
  </element>
</define>
<!-- body element: contains only the Structure pattern -->
<define name="Body">
  <element name="body">
    <ref name="Structure"/>
  </element>
</define>
<!-- A structure is a non-empty sequence of sections -->
<define name="Structure">
  <oneOrMore>
    <ref name="Section"/>
  </oneOrMore>
</define>
<!-- Recursive and hierarchical organization of the document:
     a section may itself contain a Structure, i.e. further sections -->
<define name="Section">
  <element name="section">
    <!-- typing of the section; SectionType is defined outside this excerpt
         (the original comment describes it as optional) -->
    <ref name="SectionType"/>

    <!-- required header: an h1 title, then up to four typed div blocks in this order -->
    <element name="header">
      <element name="h1"><text/></element>
      <optional>
        <element name="div">
          <attribute name="data-hdoc-type">
            <value>author</value>
          </attribute>
          <text/>
        </element>
      </optional>
      <optional>
        <element name="div">
          <attribute name="data-hdoc-type">
            <value>date</value>
          </attribute>
          <!-- unlike its siblings, this div is typed: its content must be an XSD date -->
          <data type="date"/>
        </element>
      </optional>
      <optional>
        <element name="div">
          <attribute name="data-hdoc-type">
            <value>rights</value>
          </attribute>
          <text/>
        </element>
      </optional>
      <optional>
        <element name="div">
          <attribute name="data-hdoc-type">
            <value>introduction</value>
          </attribute>
          <text/>
        </element>
      </optional>
    </element>

    <!-- optional content of the section -->
    <optional>
      <ref name="Content"/>
    </optional>

    <!-- optional internal sub-structure (nested sections) -->
    <optional>
      <ref name="Structure"/>
    </optional>

    <!-- optional footer: conclusion text, then tags and categories given as span lists -->
    <optional>
      <element name="footer">
        <optional>
          <element name="div">
            <attribute name="data-hdoc-type">
              <value>conclusion</value>
            </attribute>
            <text/>
          </element>
        </optional>
        <optional>
          <element name="div">
            <attribute name="data-hdoc-type">
              <value>tags</value>
            </attribute>
            <oneOrMore>
              <element name="span"><text/></element>
            </oneOrMore>
          </element>
        </optional>
        <optional>
          <element name="div">
            <attribute name="data-hdoc-type">
              <value>categories</value>
            </attribute>
            <oneOrMore>
              <element name="span"><text/></element>
            </oneOrMore>
          </element>
        </optional>
      </element>
    </optional>
  </element>
</define>
<!-- A content is a non-empty sequence of blocks, each one being a div -->
<define name="Content">
  <oneOrMore>
    <element name="div">
      <!-- typing attribute of the block; DivType is defined outside this excerpt
           (the original comment describes it as optional) -->
      <ref name="DivType"/>
      <!-- a block may start with an h6 title -->
      <optional>
        <element name="h6"><text/></element>
      </optional>
      <!-- the media carried by the block -->
      <ref name="Flow"/>
    </element>
  </oneOrMore>
</define>
<!-- A flow is a non-empty sequence of media, in any order and mix:
     text, table, image, audio, video or other objects -->
<define name="Flow">
  <oneOrMore>
    <choice>
      <ref name="Text"/>
      <ref name="Table"/>
      <ref name="Image"/>
      <ref name="Audio"/>
      <ref name="Video"/>
      <ref name="Object"/>
    </choice>
  </oneOrMore>
</define>
<!-- Text: one or more paragraphs (P) and lists (L), freely interleaved -->
<define name="Text">
  <oneOrMore>
    <choice>
      <ref name="P"/>
      <ref name="L"/>
    </choice>
  </oneOrMore>
</define>
<!-- Paragraph: a p element holding inline content -->
<define name="P">
  <element name="p">
    <ref name="Inline"/>
  </element>
</define>
<!-- Inline elements: any mix (possibly empty) of text and inline markup;
     every inline element may itself contain inline content -->
<define name="Inline">
  <!-- TODO (carried over from the original): cite, code...
       sub and sup, also listed in that TODO, are already accepted below -->
  <zeroOrMore>
    <choice>
      <text/>
      <element name="em">
        <ref name="Inline"/>
      </element>
      <element name="i">
        <ref name="Inline"/>
      </element>
      <element name="q">
        <ref name="Inline"/>
      </element>
      <element name="sub">
        <ref name="Inline"/>
      </element>
      <element name="sup">
        <ref name="Inline"/>
      </element>
      <!-- link: href is required -->
      <element name="a">
        <attribute name="href">
          <data type="anyURI"/>
        </attribute>
        <!-- bibliographic reference management: optionally marks the link as bibtexml -->
        <optional>
          <attribute name="data-hdoc-type">
            <value>bibtexml</value>
          </attribute>
        </optional>
        <ref name="Inline"/>
      </element>
      <!-- span: SpanType is defined outside this excerpt -->
      <element name="span">
        <ref name="SpanType"/>
        <ref name="Inline"/>
      </element>
    </choice>
  </zeroOrMore>
</define>
<!-- List: either ul or ol; both take their content from the I pattern -->
<define name="L">
  <choice>
    <element name="ul">
      <ref name="I"/>
    </element>
    <element name="ol">
      <ref name="I"/>
    </element>
  </choice>
</define>
<!-- List item -->