Commit caa2e203 authored by Gregory's avatar Gregory

Merge branch 'master' of https://gitlab.utc.fr/crozatst/hdoc

parents f194c1a3 349c6133
...@@ -6,8 +6,8 @@ http://www.gnu.org/licenses/gpl-3.0.txt ...@@ -6,8 +6,8 @@ http://www.gnu.org/licenses/gpl-3.0.txt
## Credits ## Credits
- 2016 - 2016
- - Etienne Chognard - Etienne Chognard
- - Fabien Boucaud - Fabien Boucaud
- 2015 - 2015
- Jean-Côme Douteau - Jean-Côme Douteau
- Gabrielle Rit - Gabrielle Rit
...@@ -54,7 +54,14 @@ Example : ...@@ -54,7 +54,14 @@ Example :
Currently available on: https://framemo.org/framapad_to_opale Currently available on: https://framemo.org/framapad_to_opale
See also : https://bimestriel.framapad.org/p/nf29_framapad_to_opale for the full documentation of our working process.
## TODO ## TODO
- Nested Lists
- Indentation
- Titles (and therefore Structure of the doc)
- Coloured text
- Code
- Markdown - Markdown
## Technical notes ## Technical notes
...@@ -85,4 +92,4 @@ Currently available on: https://framemo.org/framapad_to_opale ...@@ -85,4 +92,4 @@ Currently available on: https://framemo.org/framapad_to_opale
- br -> p - br -> p
## Capitalisation ## Capitalisation
Using regular expression with xsl is a good way to parse a non xml file. Using regular expression with xsl is a good way to parse a non xml file.
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project basedir="." name="myantce" default="main"> <project basedir="." name="myantce" default="main">
<property file="etherpad_to_hdoc.properties"/> <property file="framapad_to_hdoc.properties"/>
<!-- import classes --> <!-- import classes -->
<taskdef resource="net/sf/antcontrib/antlib.xml"/> <taskdef resource="net/sf/antcontrib/antlib.xml"/>
<taskdef name="htmlcleaner" classname="org.htmlcleaner.HtmlCleanerForAnt"/> <taskdef name="htmlcleaner" classname="org.htmlcleaner.HtmlCleanerForAnt"/>
......
@echo off @echo off
set lib=lib set lib=lib
set ant=etherpad_to_hdoc.ant set ant=framapad_to_hdoc.ant
set antparam=-Dprogram.param=%1 set antparam=-Dprogram.param=%1
set scJarList=%lib%\* set scJarList=%lib%\*
......
#!/bin/sh #!/bin/sh
lib="lib" lib="lib"
ant="etherpad_to_hdoc.ant" ant="framapad_to_hdoc.ant"
antparam="-Dprogram.param=$1" antparam="-Dprogram.param=$1"
#Recherche de java et controle que se soit une version SUN #Recherche de java et controle que se soit une version SUN
......
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project basedir="." name="myantce" default="main"> <project basedir="." name="myantce" default="main">
<property file="etherpad_to_opale.properties"/> <property file="framapad_to_opale.properties"/>
<!-- CHECK FOR OS FAMILY --> <!-- CHECK FOR OS FAMILY -->
<condition property="is_windows"> <condition property="is_windows">
<os family="windows"/> <os family="windows"/>
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
<include name="*.html"/> <include name="*.html"/>
</fileset> </fileset>
</copy> </copy>
<exec dir="../etherpad_to_hdoc" executable="run.bat"/> <exec dir="../framapad_to_hdoc" executable="run.bat"/>
</target> </target>
<target name="to_hdoc_unix" if="${is_unix}"> <target name="to_hdoc_unix" if="${is_unix}">
...@@ -38,8 +38,8 @@ ...@@ -38,8 +38,8 @@
<include name="*.html"/> <include name="*.html"/>
</fileset> </fileset>
</copy> </copy>
<exec executable="/bin/bash" dir="../etherpad_to_hdoc"> <exec executable="/bin/bash" dir="../framapad_to_hdoc">
<arg value="../etherpad_to_hdoc/run.sh"/> <arg value="../framapad_to_hdoc/run.sh"/>
</exec> </exec>
</target> </target>
...@@ -90,4 +90,4 @@ ...@@ -90,4 +90,4 @@
<antcall target="to_opale_windows"/> <antcall target="to_opale_windows"/>
<antcall target="to_opale_end"/> <antcall target="to_opale_end"/>
</target> </target>
</project> </project>
\ No newline at end of file
...@@ -5,8 +5,8 @@ xsl = ${basedir}/xsl ...@@ -5,8 +5,8 @@ xsl = ${basedir}/xsl
lib = ${basedir}/lib lib = ${basedir}/lib
log = ${basedir}/log log = ${basedir}/log
eth_in = ../etherpad_to_hdoc/input eth_in = ../framapad_to_hdoc/input
eth_out = ../etherpad_to_hdoc/output eth_out = ../framapad_to_hdoc/output
opa_in = ../hdoc_to_opale/input opa_in = ../hdoc_to_opale/input
opa_out = ../hdoc_to_opale/output opa_out = ../hdoc_to_opale/output
rootfilename = content.xml rootfilename = content.xml
\ No newline at end of file
@echo off @echo off
set lib=lib set lib=lib
set ant=etherpad_to_opale.ant set ant=framapad_to_opale.ant
set antparam=-Dprogram.param=%1 set antparam=-Dprogram.param=%1
set scJarList=%lib%\* set scJarList=%lib%\*
......
#!/bin/sh #!/bin/sh
lib="lib" lib="lib"
ant="etherpad_to_opale.ant" ant="framapad_to_opale.ant"
antparam="-Dprogram.param=$1" antparam="-Dprogram.param=$1"
#Recherche de java et controle que se soit une version SUN #Recherche de java et controle que se soit une version SUN
......
Converter hdoc_to_basex
-----------------------
The purpose of this converter is to obtain an XML data file suitable for importation into basex for futher XQuery requests from a HDOC file
License GPL3.0
--------------
http://www.gnu.org/licenses/gpl-3.0.txt
Credits
-------
* Simei YIN
* Baptiste MONTANGE
Dependance
----------
This project can be used alone if you want to import an HDOC file into basex.
## User stories
------------------
- Among a group of courses, user can search by title, author or keywords of the course.
- By searching a certain keyword, user can obtain the sections that contain it with their hierarchy levels in the course.
- In a certain section, by searching a key word, user can obtain paragraphes that contain it.
- User can get definitions related to a keyword
- User can get examples whose titles contain a keyword
Step by step :
[Step 1 : File transformation]
- Put the files .hdoc you want to deal with in the folder [input](https://gitlab.utc.fr/crozatst/hdoc/tree/master/hdoc_to_basex/input)
- Run the transformation progam (Win : double click run.bat, Linux : execute run.sh)
[Step 2 : Create data base in basex]
- Download and install [BaseX](http://basex.org/products/download/all-downloads/)
- Run BasexGui
- In the Text Editor of BaseX, open the command script "createbd.bxs" in folder [/basex/command] (https://gitlab.utc.fr/crozatst/hdoc/tree/master/hdoc_to_basex/basex/command).
Follow the instructions in the script, and then execute it.
[Step 3 : Make XQuery request]
-
\ No newline at end of file
# This script allows for creation of a data base in baseX from the xml files in the folder output
# Before executing this script, please make sure that you have successfully finished the transformation in the step 1 in the "user story" of the README.md
# Syntax : CREATE DB [name] ([input])
# Example :
CREATE DB myDB D:\School\UTC\GI04\NF29\Projet\hdoc\hdoc_to_basex\output
#CREATE DB myDB [..the repertory of the projet in your local disk..]/hdoc/hdoc_to_basex/output
# P.S. If you get error message : Resource "..." not found, please confirm your file repertory is correct
\ No newline at end of file
(: This script will return documents by searching its author :)
(: We can assign a Regular Expression to the variable $author :)
(: For example, $author := '^Baptiste Montangé$', to search for an exact name:)
(: For example, $name := 'Montangé', to search for documents whose author named Coutant:)
(: Remark : Accents in the authors' names have been taken care of :)
<documents>{
let $name := 'Montangé'
let $name_noAcc := translate($name, 'áàâäéèêëíìîïóòôöúùûü','aaaaeeeeiiiioooouuuu')
for $doc in //document
return
for $author in $doc/authors/author
let $titre := $doc/titre
let $author_noAcc := translate($author, 'áàâäéèêëíìîïóòôöúùûü','aaaaeeeeiiiioooouuuu')
where matches($author_noAcc, $name_noAcc)
group by $titre (: Avoid duplications of documents by their titre:)
return $doc
}</documents>
\ No newline at end of file
(: This script will return documents by searching one keyword in their title :)
(: We can assign a Regular Expression to the variable $name :)
(: For example, $name := '^NF29_HdocEtherpad$', to search for an exact name:)
(: For example, $name := 'NF29', to search for documents whose name contains 'NF29':)
<documents>{
let $name := '^NF29_HdocEtherpad$'
for $doc in //document
where matches($doc/titre, $name, "i")
return $doc
}</documents>
\ No newline at end of file
lib=${basedir}/lib
log=${basedir}/log
xsl=${basedir}/xsl
in=${basedir}/input
out=${basedir}/output
tmp=${basedir}/tmp
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project basedir="." name="myantce" default="convert">
<taskdef resource="net/sf/antcontrib/antlib.xml"/>
<property file="build.properties"/>
<target name="convert">
<!-- Preparation for the file transformation : delete old folders and create new folders -->
<mkdir dir="${tmp}"/>
<delete dir="${out}" failonerror="false"/>
<mkdir dir="${out}"/>
<delete dir="${log}" failonerror="false"/>
<mkdir dir="${log}"/>
<echo message="DEBUT"/>
<!-- Convert all the hdoc files in the directory ${in} to data xml files that will be imported to basex later.
Fonctions "UnzipHdocFile" and "content" will be called. -->
<for param="inputFile">
<path>
<fileset dir="${in}" includes="**/*.hdoc"/>
</path>
<sequential>
<local name="filename"/>
<basename property="filename" file="@{inputFile}"/>
<antcall target="UnzipHdocFile">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="content">
<param name="filename" value="${filename}"/>
</antcall>
<echo message="FIN"/>
</sequential>
</for>
<!-- Clean-->
<delete dir="${tmp}" failonerror="false"/>
</target>
<target name="UnzipHdocFile">
<!-- Unzip the input hdoc file. Decompressed folder is named "decompressedHdoc" : this name is the only one which
refers to the hdoc file furthermore in this project. -->
<unzip src="${in}/${filename}" dest="${tmp}/${filename}/decompressedHdoc"/>
<chmod dir="${tmp}/${filename}/decompressedHdoc" perm="777"/>
<echo message="${tmp}/${filename}/decompressedHdoc"/>
</target>
<target name="content" >
<!-- Transformation of a xml file decompressed from hdoc file to data xml file to be imported to basex.
The transformation will be done in terms of the xsl file in ${xsl}.-->
<xslt in="${tmp}/${filename}/decompressedHdoc/content.xml" out="${out}/${filename}_data.xml" style="${xsl}/transformation.xsl" processor="org.apache.tools.ant.taskdefs.optional.TraXLiaison">
<param name="filename" expression="${filename}"/>
<param name="lib" expression="${lib}"/>
</xslt>
<echo message="${filename}" />
</target>
</project>
\ No newline at end of file
@echo off
set lib=lib
set ant=hdoc_to_basex.ant
set antparam=-Dprogram.param=%1
set scJarList=%lib%\*
java.exe -classpath "%scJarList%" -Xmx150m org.apache.tools.ant.Main -buildfile %ant% %antparam%
pause
REM start /MIN java.exe -classpath "%scJarList%" -Xmx150m org.apache.tools.ant.Main -buildfile %ant% %antparam%
#!/bin/sh
lib="lib"
ant="hdoc_to_basex.ant"
antparam="-Dprogram.param=$1"
#Recherche de java et controle que se soit une version SUN
vJavaCmd="java"
xCheckJava () {
vInputVarName=\$"$1"
vInputVarVal=`eval "expr \"$vInputVarName\" "`
if [ -z "$vInputVarVal" ];then
eval "$1=false"
return
fi
vSunJavaFound=`$vInputVarVal -version 2>&1 | grep -Eo -m 1 "(HotSpot)|(OpenJDK)"`
if [ "$vSunJavaFound" != "HotSpot" ] && [ "$vSunJavaFound" != "OpenJDK" ] ; then
eval "$1=false"
return
fi
}
xCheckJava vJavaCmd
if [ "$vJavaCmd" = "false" ]; then
vJavaCmd="$JAVA_HOME/bin/java"
xCheckJava vJavaCmd
if [ "$vJavaCmd" = "false" ]; then
echo "ERREUR: JRE de SUN introuvable. Veuillez déclarer la variable d'environnement JAVA_HOME."
exit 1
fi
fi
#Lancer la commande
scJarList="$lib/*"
$vJavaCmd -classpath "$scJarList:" -Xmx150m org.apache.tools.ant.Main -buildfile $ant $antparam
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:h="http://www.utc.fr/ics/hdoc/xhtml"
exclude-result-prefixes="xs" version="2.0">
<xsl:template match="h:html">
<document>
<xsl:apply-templates mode="title"/>
</document>
</xsl:template>
<xsl:template match="h:head" mode="title">
<titre>
<xsl:value-of select="./h:title"/>
</titre>
<xsl:apply-templates select="./h:meta"/>
</xsl:template>
<xsl:template match="h:meta[@name='author']">
<authors>
<xsl:for-each select="tokenize(@content,', \d')">
<xsl:if test="position() = 1">
<xsl:for-each select="tokenize(.,', ')">
<author>
<xsl:value-of select="."/>
</author>
</xsl:for-each>
</xsl:if>
</xsl:for-each>
</authors>
</xsl:template>
<xsl:template match="h:body" mode="title">
<sections>
<xsl:apply-templates select="./h:section"/>
</sections>
</xsl:template>
<xsl:template match="h:section">
<section>
<titresection>
<xsl:value-of select="./h:header/h:h1"/>
</titresection>
<contenu>
<paragraphes>
<xsl:apply-templates select="h:div"/>
</paragraphes>
<xsl:apply-templates select="h:section" mode="soussect"/>
</contenu>
</section>
</xsl:template>
<xsl:template match="h:section" mode="soussect">
<soussection></soussection>
</xsl:template>
<xsl:template match="h:div" >
<xsl:if test="h:p">
<paragraphe>
<xsl:value-of select="."/>
</paragraphe>
</xsl:if>
<xsl:if test="h:ul">
<xsl:apply-templates select="h:ul"></xsl:apply-templates>
</xsl:if>
</xsl:template>
<xsl:template match="h:ul">
<xsl:for-each select="h:li">
<paragraphe>
<xsl:value-of select="h:p"/>
</paragraphe>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
This diff is collapsed.
...@@ -18,27 +18,40 @@ Credits ...@@ -18,27 +18,40 @@ Credits
Presentation Presentation
------------ ------------
"Hdoc to Epub" is an hdoc converter to epub files. It's a set of ANT scripts and XSL files
Dependencies Dependencies
------------ ------------
There's no particular dependencies needed to run the converter.
User Documentation User Documentation
------------------ ------------------
### Scenario
La personne possède un hdoc et voudrait le convertir en epub. Il se dirige vers le site hdoc et télécharge le zip. Il extrait le zip et se rend vers le dossier hdoc_to_epub. Il lit le README.md et suit les directives pour obtenir son format epub. Pour cela, il doit coller son hdoc dans le dossier input et lancer l'exécutable "run". Enfin, il aura son epub dans le dossier output.
Unsupported Unsupported
----------- -----------
Known bugs Known bugs
---------- ----------
Problème dans le hdoc : contient des images mais ne sont pas référencées.
Problème avec les keywords : le hdoc contient des keywords mais nous ne savons pas où les utiliser.
Todo list Todo list
--------- ---------
Validation de l'epub actuel
Mise en place de la première page
Validation
Étude compatibilité entre EPUB2 et EPUB3
Technical Notes Technical Notes
--------------- ---------------
<?xml version="1.0"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile full-path="oebps/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
application/epub+zip
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<!-- test.ant --> <project name="hdoc_to_epub" basedir="." default="convert">
<project>
<property name="p">Default</property> <taskdef resource="net/sf/antcontrib/antlib.xml"/>
<echo message="${p}"/> <taskdef name="jing" classname="com.thaiopensource.relaxng.util.JingTask"/>
<property name="in" location="${basedir}/input"/>
<property name="out" location="${basedir}/output"/>
<property name="tmp" location="${basedir}/tmp"/>
<property name="xsl" location="${basedir}/xsl"/>
<property name="file" location="${basedir}/file"/>
<property name="schema" location="${basedir}/schema"/>
<property name="lib" location="${basedir}/lib"/>
<property name="tmpRetour" location="${basedir}/tmp/retour"/>
<target name="convert">
<delete dir="${tmp}" failonerror="false"/>
<sleep seconds="1"/>
<mkdir dir="${tmp}"/>
<delete dir="${out}" failonerror="false"/>
<sleep seconds="1"/>
<mkdir dir="${out}"/>
<mkdir dir="${tmpRetour}"/>
<for param="inputFile">
<path>
<fileset dir="${in}" includes="**/*.hdoc"/>
</path>
<sequential>
<local name="filename"/>
<basename property="filename" file="@{inputFile}"/>
<antcall target="UnzipHdocFile">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="ValidateInput">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="MakeOCF">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="MakeOPF">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="MakeOPS">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="ZipEpubFile">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="ValidateOutput">
<param name="filename" value="${filename}"/>
</antcall>
<antcall target="CleanTmp"/>
</sequential>
</for>
</target>
<target name="UnzipHdocFile">
<!-- Unzip the input hdoc file. Decompressed folder is named "decompressedHdoc" : this name is the only one which
refers to the hdoc file furthermore in this project. -->
<unzip src="${in}/${filename}" dest="${tmp}/${filename}/decompressedHdoc"/>
<chmod dir="${tmp}/${filename}/decompressedHdoc" perm="777"/>
</target>
<!-- Validating the XML container file -->
<target name="ValidateInput">
<trycatch property="foo" reference="bar">
<try>
<jing file="${tmp}/${filename}/decompressedHdoc/META-INF/container.xml" rngfile="${schema}/hdoc1-container.rng"></jing>
</try>
<catch>
<echo>Validation failed</echo>
</catch>
</trycatch>
</target>
<target name="MakeOCF">
<mkdir dir="${tmpRetour}/META-INF"/>
<mkdir dir="${tmpRetour}/oebps"/>
<copy file="${file}/mimetype" todir="${tmpRetour}"/>
<copy file="${file}/container.xml" todir="${tmpRetour}/META-INF"/>
</target>
<target name="MakeOPF">
<xslt in="${tmp}/${filename}/decompressedHdoc/content.xml" out="${tmpRetour}/oebps/content.opf" classpath="${lib}/saxon9he.jar" style="${xsl}/content.xsl"/>
</target>
<target name="MakeOPS">
<mkdir dir="${tmpRetour}/oebps/images"/>
<copy todir="${tmpRetour}/oebps/images">
<fileset dir="${tmp}/${filename}/decompressedHdoc/re" includes="**"/>
</copy>
<mkdir dir="${tmpRetour}/oebps/styles"/>
<mkdir dir="${tmpRetour}/oebps/chapitres"/>
<xslt in="${tmp}/${filename}/decompressedHdoc/content.xml" out="${tmpRetour}/oebps/tableDesMatieres.ncx" classpath="${lib}/saxon9he.jar" style="${xsl}/tdmncx.xsl"/>
<xslt destdir="${tmpRetour}/oebps/chapitres" basedir="${tmp}/${filename}/decompressedHdoc" includes="content.xml" classpath="${lib}/saxon9he.jar" style="${xsl}/chapitre.xsl"/>
<delete file="${tmpRetour}/oebps/chapitres/content.html"/>
</target>
<target name="ZipEpubFile">
<propertyregex property="properFilename" input="${filename}" regexp=".hdoc" replace="" casesensitive="false" override="true" />
<zip destfile="${out}/${properFilename}.epub" update="true" encoding="UTF-8">
<zipfileset dir="${tmpRetour}/META-INF"
includes="*" prefix="META-INF"/>
<zipfileset dir="${tmpRetour}/oebps"
includes="**" prefix="oebps"/>
</zip>
<!-- zip64Mode="never" -->
<zip destfile="${out}/${properFilename}.epub" update="true" compress="false" encoding="UTF-8" keepcompression="true" createunicodeextrafields="never">
<zipfileset dir="${tmpRetour}"
includes="mimetype"/>
</zip>
</target>
<target name="ValidateOutput">
</target>
<target name="CleanTmp">
<delete dir="${tmp}" failonerror="false"/>
</target>
</project> </project>
<?xml version="1.0" encoding="UTF-8"?>
<grammar
xmlns="http://relaxng.org/ns/structure/1.0"
xmlns:a="http://relaxng.org/ns/compatibility/annotations/1.0"
datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes"
ns="urn:utc.fr:ics:hdoc:container"
>
<a:documentation>This schema describes the META-INF/container.xml file for hdoc format</a:documentation>
<a:documentation>This schema is a derivation of http://www.idpf.org/epub/30/schema/ocf-container-30.rnc
from EPUB Open Container Format (http://www.idpf.org/epub/30/spec/epub30-ocf.html)
</a:documentation>
<start>
<element name="container">
<attribute name="version">
<value>1.0</value>
</attribute>
<element name="rootfiles">
<element name="rootfile">
<attribute name="full-path">
<data type="anyURI"/>
</attribute>
<attribute name="media-type">
<value>text/xml</value>