<?xml version="1.0" encoding="UTF-8"?>
<!-- wiki_to_hdoc.xsl: XSLT 2.0 stylesheet producing elements in the hdoc XHTML namespace. -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:functx="http://www.functx.com"
    exclude-result-prefixes="xs functx" version="2.0" xmlns="http://www.utc.fr/ics/hdoc/xhtml">

    <!-- Serialize the principal result as indented XHTML -->
    <xsl:output method="xhtml" indent="yes"/>
    <!-- Keep whitespace-only text nodes found inside source pre elements -->
    <xsl:preserve-space elements="pre"/>
8

9 10 11 12
    <!-- Default mode: an element with no more specific template produces no output -->
    <xsl:template match="*"/>
    <!-- Default mode: text nodes are copied as their string value -->
    <xsl:template match="text()">
        <xsl:value-of select="."/>
    </xsl:template>
13

14 15 16 17
    <!-- Root element: emits the oxygen schema processing instruction, then the html/head/body skeleton -->
    <xsl:template match="html">
        <!-- Schema link -->
        <xsl:processing-instruction name="oxygen">RNGSchema="http://scenari.utc.fr/hdoc/schemas/xhtml/hdoc1-xhtml.rng" type="xml"</xsl:processing-instruction>
        <xsl:text>&#10;</xsl:text>

        <!-- html content -->
        <html>
            <head>
                <xsl:apply-templates select="head"/>
            </head>
            <body>
                <xsl:apply-templates select="body"/>
            </body>
        </html>
    </xsl:template>
29

30 31 32
    <!-- Head template: the source title followed by fixed charset, generator and author metadata -->
    <xsl:template match="head">
        <xsl:apply-templates select="title"/>
        <meta charset="utf-8"/>
        <meta name="generator" content="HdocConverter/wikipedia"/>
        <meta name="author" content="Wikipedia"/>
    </xsl:template>
37

38 39 40 41
    <!-- Body template: one introduction section, then one section per h2 of the document -->
    <xsl:template match="body">
        <!-- The first h1 of the document (in document order) opens the introduction section.
             The parentheses matter: //h1[1] would select the first h1 child of EVERY parent
             and could therefore produce several introductions. -->
        <xsl:apply-templates select="(//h1)[1]" mode="introduction"/>

        <!-- Next are all parts of the document: h2 -->
        <xsl:apply-templates select="//h2"/>
    </xsl:template>
46

47
    <!-- Document title: copied as plain text into an hdoc title element -->
    <xsl:template match="title">
        <title>
            <xsl:value-of select="."/>
        </title>
    </xsl:template>
52

53 54 55 56 57 58
    <!-- Beginning of the wikipedia page: introduction paragraph -->
    <xsl:template match="h1[1]" mode="introduction">
        <!-- Paragraphs with no h2 before them in document order and not inside a table cell -->
        <xsl:variable name="introParagraphs"
            select="//p[count(preceding::h2)=0 and not(ancestor::td)]"/>

        <section data-hdoc-type="introduction">
            <header>
                <h1>Introduction</h1>
            </header>

            <!-- Introduction text before first h2 (first section); the div is omitted when there is none -->
            <xsl:if test="$introParagraphs">
                <div>
                    <xsl:apply-templates select="$introParagraphs" mode="textOnly"/>
                </div>
            </xsl:if>
        </section>
    </xsl:template>

    <!-- Wikipedia sections and subsections.
         Each h2 to h5 heading becomes a section. Headings and their content are flat siblings in
         the source, so the content of a section is recovered with sibling-axis intersections,
         and sub-headings are processed recursively from their parent heading. -->
    <xsl:template match="h2|h3|h4|h5">
        <section>
            <!-- h3 sections are opale "grains" -->
            <xsl:if test="self::h3">
                <xsl:attribute name="data-hdoc-type">unit-of-content</xsl:attribute>
            </xsl:if>
            <header>
                <h1>
                    <xsl:apply-templates select="node()" mode="title"/>
                </h1>
            </header>

            <!-- Keep the current heading: inside the for-each loops below the context item changes -->
            <xsl:variable name="currentSectionTitle" select="."/>

            <!-- Next heading (h1 to h6) among the following siblings. The name is matched exactly
                 so that other elements whose name starts with "h" (hr, header, ...) are not
                 mistaken for a heading. -->
            <xsl:variable name="nextTitle"
                select="following-sibling::*[matches(name(), '^h[1-6]$')][1]"/>

            <!-- Thumbnail divs located between the current heading and the next one.
                 NOTE(review): "and" binds tighter than "or", so the img test only applies to
                 'thumb tright'; and when there is no next heading the intersection is empty.
                 Confirm both are intended. -->
            <xsl:apply-templates
                select="following-sibling::div[contains(@class,'thumb tleft') or contains(@class,'thumb tright') and not(descendant::img)] intersect $nextTitle/preceding-sibling::*"/>

            <!--             <xsl:apply-templates select="following-sibling::p/img"/>-->
            <!-- If there is text right below the section name (i.e. the next sibling is not a sub-heading), copy it -->
            <xsl:if
                test="not(following-sibling::*[1] intersect following-sibling::h3) and not(following-sibling::*[1] intersect following-sibling::h4) and not(following-sibling::*[1] intersect following-sibling::h5) and not(following-sibling::*[1] intersect following-sibling::h6)">
                <div>
                    <!-- Walk the siblings up to the next h2 and keep those with no h3/h4/h5/h6
                         between the current heading and themselves.
                         NOTE(review): when no h2 follows, the selection below is empty. -->
                    <xsl:for-each
                        select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                        <xsl:if
                            test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                            <xsl:apply-templates select="." mode="textOnly"/>
                        </xsl:if>
                    </xsl:for-each>
                </div>
            </xsl:if>

            <!-- Applying template of subsections if any -->
            <xsl:choose>
                <xsl:when test="self::h2">
                    <!-- h2 can have h3 subsections: every h3 before the next h2 -->
                    <xsl:apply-templates
                        select="following-sibling::h3 intersect following-sibling::h2[1]/preceding-sibling::h3"
                    />
                </xsl:when>
                <xsl:when test="self::h3">
                    <!-- Apply template to h4 subsections of h3. These h4 are below the current h3: previous h3 of these h4 is current h3. -->
                    <xsl:for-each
                        select="following-sibling::h4 intersect following-sibling::h2[1]/preceding-sibling::h4">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h4">
                    <!-- Apply template to the following h5 (before the next h2) whose nearest
                         preceding h4 is the current h4 and whose nearest preceding h3 is the
                         same as the current h4's -->
                    <xsl:for-each
                        select="following-sibling::h5 intersect following-sibling::h2[1]/preceding-sibling::h5">
                        <xsl:if
                            test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h5">
                    <!-- Apply template to the following h6 (before the next h2) whose nearest
                         preceding h5 is the current h5 and whose nearest preceding h3 and h4
                         are the same as the current h5's -->
                    <xsl:for-each
                        select="following-sibling::h6 intersect following-sibling::h2[1]/preceding-sibling::h6">
                        <xsl:if
                            test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle/preceding-sibling::h4[1]) and (preceding-sibling::h5[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
            </xsl:choose>
        </section>
    </xsl:template>
151

152 153 154
    <!-- Wikipedia h6 is not a section in hdoc: div with h6 title -->
    <xsl:template match="h6">
        <div>
            <h6>
                <xsl:apply-templates select="node()" mode="titleh6"/>
            </h6>

            <!-- Keep the current heading: the context item changes inside the loop below -->
            <xsl:variable name="currentSectionTitle" select="."/>
            <!-- Text of h6 is contained between current h6 and next section title (next h3/h4/h5/h6):
                 walk the siblings up to the next h2 and keep those with no heading in between -->
            <xsl:for-each
                select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                <xsl:if
                    test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                    <xsl:apply-templates select="." mode="textOnly"/>
                </xsl:if>
            </xsl:for-each>
        </div>
    </xsl:template>

    <!-- Text elements not surrounded by div: re-created under the same local name in the
         hdoc namespace, children processed in textOnly mode -->
    <xsl:template match="p|span|i|ul|ol" mode="textOnly">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
177

178 179 180 181 182 183 184 185
    <!-- Paragraph reached in the default mode: wrapped in a div, content handled in textOnly mode -->
    <xsl:template match="p">
        <div>
            <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
                <!-- no select attribute: child nodes are selected by default -->
                <xsl:apply-templates mode="textOnly"/>
            </xsl:element>
        </div>
    </xsl:template>
186

187 188 189
    <!-- li (any mode): the whole item content is wrapped in a single p -->
    <xsl:template match="li" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <p>
                <xsl:apply-templates select="node()" mode="textOnly"/>
            </p>
        </xsl:element>
    </xsl:template>
196
    <!-- Images embedded in the text: a gallery box item becomes an li holding the image
         (referenced by file name under ./ressources/) and its caption -->
    <xsl:template match="li[@class='gallerybox']" mode="#all">
        <!-- Use the first image only: tokenize() accepts at most one item, so a box with
             several img elements would otherwise raise a type error -->
        <xsl:variable name="galleryImage" select="(.//img)[1]"/>
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <!-- File name = last path segment of src; translate() maps '?' to '_' and removes '%' -->
            <img src="./ressources/{translate(tokenize($galleryImage/@src, '/')[last()],'?%','_')}" alt="{$galleryImage/@alt}"/>
            <p>
                <!-- 'gallerytext' is the correct spelling; the former misspelling is still accepted -->
                <xsl:value-of select=".//div[@class = ('gallerytext', 'galerytext')]/p"/>
            </p>
        </xsl:element>
    </xsl:template>

207 208 209
    <!-- text followed directly by ul not allowed in li: the nodes before the first nested
         list go into a p, the lists follow it -->
    <xsl:template match="li[descendant::ul]" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <p>
                <!-- Preceding siblings of a descendant ul are themselves descendants of this li -->
                <xsl:apply-templates
                    select="descendant::ul[1]/preceding-sibling::node()"
                    mode="textOnly"/>
            </p>
            <!-- Outermost lists only: a ul nested in another ul is already rendered when its
                 enclosing li is processed, so selecting it here would output it twice -->
            <xsl:apply-templates select="descendant::ul except descendant::ul/descendant::ul"
                mode="textOnly"/>
        </xsl:element>
    </xsl:template>
218

219 220
    <!-- dl containing dt: rendered as a ul, one item per dt child (the dt template pulls in its dd) -->
    <xsl:template match="dl[descendant::dt]" mode="#all">
        <ul>
            <xsl:apply-templates select="dt"/>
        </ul>
    </xsl:template>
225

226 227 228
    <!-- dl without any dt: no list wrapper, only its dd children are processed -->
    <xsl:template match="dl[not(descendant::dt)]" mode="#all">
        <xsl:apply-templates select="dd"/>
    </xsl:template>
229

230 231 232
    <!-- dt contains the title: rendered as a list item whose first paragraph is the emphasized title -->
    <xsl:template match="dt">
        <li>
            <p>
                <em>
                    <xsl:apply-templates select="node()" mode="titleh6"/>
                </em>
            </p>

            <!-- A dt can be followed by several dd before the next dt: process every dd whose
                 nearest preceding dt is the current one -->
            <xsl:variable name="currentElement" select="."/>
            <xsl:apply-templates
                select="following-sibling::dd[preceding-sibling::dt[1] intersect $currentElement]"/>

            <!-- Some wikipedia articles don't use dd after dt: fall back to the first p following the dl -->
            <xsl:if test="not(following-sibling::dd[1])">
                <xsl:apply-templates select="../following-sibling::p[1]" mode="textOnly"/>
            </xsl:if>
        </li>
    </xsl:template>
250

251 252
    <!-- dd contains the content: a p built from its child nodes, then its ul/ol children as lists -->
    <xsl:template match="dd">
        <p>
            <!-- Default mode: child elements without a default-mode template produce nothing.
                 NOTE(review): inline markup such as b or i is therefore dropped here; confirm intended. -->
            <xsl:apply-templates select="node()"/>
        </p>
        <xsl:apply-templates select="ul|ol" mode="textOnly"/>
    </xsl:template>
258

259 260 261 262
    <!-- Rules for title elements (h1, h2...): in title mode an element emits no markup of its
         own and simply passes its children on in the same mode -->
    <xsl:template match="*" mode="title">
        <xsl:apply-templates select="node()" mode="title"/>
    </xsl:template>
263

264 265 266
    <!-- titleh6 mode: any element is reduced to its string value (explicit priority 2) -->
    <xsl:template match="*" mode="titleh6" priority="2">
        <xsl:value-of select="."/>
    </xsl:template>
267

268 269 270 271 272 273
    <!-- b is not allowed in the output, em is: every b is rewritten as em -->
    <xsl:template match="b" mode="textOnly">
        <em>
            <xsl:apply-templates mode="textOnly"/>
        </em>
    </xsl:template>
274

275
    <!-- b outside p, li and a: wrap it into a p (and convert it to em) -->
    <xsl:template match="b[not(ancestor::p) and not(ancestor::li) and not(ancestor::a)]"
        mode="textOnly">
        <p>
            <em>
                <xsl:apply-templates select="node()" mode="textOnly"/>
            </em>
        </p>
    </xsl:template>
284

285 286 287 288 289
    <!-- Link elements -->
    <!-- a in title not allowed, only keeping text: applies in every mode to links that are
         children of a span directly inside an h2 to h6 heading -->
    <xsl:template match="h2/span/a | h3/span/a | h4/span/a | h5/span/a | h6/span/a" mode="#all">
        <xsl:value-of select="."/>
    </xsl:template>
290

291 292 293 294 295 296
    <!-- Generic link (any mode): kept as an a element with an absolute href and text-only content.
         - href starting with '//' (protocol-relative): prefixed with 'http:'
         - href starting with a URI scheme (http:, https:, mailto:, ...): kept unchanged
         - anything else (site-relative path, or missing href): prefixed with http://wikipedia.org
         Prefixing already-absolute URLs would produce broken links. -->
    <xsl:template match="a" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:attribute name="href"
                select="if (starts-with(@href, '//')) then concat('http:', @href)
                        else if (matches(@href, '^[a-zA-Z][a-zA-Z0-9+.-]*:')) then string(@href)
                        else concat('http://wikipedia.org', @href)"/>
            <xsl:value-of select="."/>
        </xsl:element>
    </xsl:template>
297

298 299 300 301
    <!-- Link elements without actual wikipedia page (class contains 'new'): keeping only
         their text, in every mode -->
    <xsl:template match="a[contains(@class, 'new')]" mode="#all">
        <xsl:value-of select="."/>
    </xsl:template>
302

303
    <!-- Handle Listing: only the pre children of a syntax-highlight div are processed -->
    <xsl:template match="div[@class='mw-highlight mw-content-ltr']" mode="textOnly">
        <xsl:apply-templates select="pre" mode="textOnly"/>
    </xsl:template>
307

308 309 310 311
    <!-- Code listing: the text of the pre is written to listing/listingN.xml (an sc:item
         document serialized with method="text") and replaced in the main result by an object
         element pointing to that file. -->
    <xsl:template match="pre" mode="textOnly">
        <!-- Rank of this pre among all pre elements of the document; makes the file name unique -->
        <xsl:variable name="vUid">
            <xsl:number level="any" count="pre"/>
        </xsl:variable>

        <!-- Text content only. Attributes must not be copied into a temporary tree: an
             attribute node directly under a document node is a dynamic error. -->
        <xsl:variable name="listingContent" select="string(.)"/>

        <!-- Remove one leading line break and the trailing line break (with trailing blanks).
             Unlike cutting at the first and last line break unconditionally, this keeps the
             first and last lines of code and does not empty a single-line listing. -->
        <xsl:variable name="trimmedContent"
            select="replace(replace($listingContent, '^\r?\n', ''), '\r?\n[ \t]*$', '')"/>

        <!-- The file is written as raw text, so markup characters are escaped by hand;
             the ampersand must be handled first -->
        <xsl:variable name="basename"
            select="replace(replace(replace($trimmedContent, '&amp;', '&amp;amp;'), '&lt;', '&amp;lt;'), '>', '&amp;gt;')"/>

        <object type="application/x-listing" data="listing/listing{$vUid}.xml">
            <!-- TODO from the original author: change the object type once the RNG schema is updated -->
            <xsl:result-document href="listing/listing{$vUid}.xml" method="text">
                <xsl:text disable-output-escaping="no">
&lt;sc:item xmlns:sc="http://www.utc.fr/ics/scenari/v3/core"&gt;
	&lt;op:code xmlns:sp="http://www.utc.fr/ics/scenari/v3/primitive" xmlns:op="utc.fr:ics/opale3"&gt;
		&lt;sc:code mimeType="text/plain"&gt;</xsl:text>
                <xsl:value-of select="$basename"/>
                <xsl:text disable-output-escaping="no">&lt;/sc:code&gt;
	&lt;/op:code&gt;
&lt;/sc:item&gt;
                </xsl:text>
            </xsl:result-document>
        </object>
    </xsl:template>
346

347
    <!-- Named template: outputs the part of $string1 located before the LAST occurrence of
         $string2. Outputs nothing when either parameter is empty or when $string2 does not
         occur in $string1.
         string1: text to cut
         string2: delimiter -->
    <xsl:template name="substring-before-last">
        <xsl:param name="string1" select="''"/>
        <xsl:param name="string2" select="''"/>

        <xsl:if test="$string1 != '' and $string2 != ''">
            <xsl:variable name="head" select="substring-before($string1, $string2)"/>
            <xsl:variable name="tail" select="substring-after($string1, $string2)"/>
            <xsl:value-of select="$head"/>
            <!-- Another delimiter remains: the one just consumed was not the last, so put it
                 back and recurse on the remainder -->
            <xsl:if test="contains($tail, $string2)">
                <xsl:value-of select="$string2"/>
                <xsl:call-template name="substring-before-last">
                    <xsl:with-param name="string1" select="$tail"/>
                    <xsl:with-param name="string2" select="$string2"/>
                </xsl:call-template>
            </xsl:if>
        </xsl:if>
    </xsl:template>
364 365


haroldcb's avatar
haroldcb committed
366
    <!-- Tables (any mode): simple tables are converted inline, all others are exported by
         the TableComplexe named template -->
    <xsl:template match="table" mode="#all">
        <xsl:choose>
            <!-- Simple table: no colspan/rowspan on any cell (td or th) and no nested table.
                 Header cells are checked too, since a spanning th changes the grid just as
                 a spanning td does. -->
            <xsl:when
                test="not(descendant::td/@colspan | descendant::td/@rowspan | descendant::th/@colspan | descendant::th/@rowspan | descendant::table)">
                <table>
                    <xsl:apply-templates select="node()" mode="textOnly"/>
                </table>
            </xsl:when>
            <xsl:otherwise>
                <xsl:call-template name="TableComplexe"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>
381

382
    <!-- Complex tables: the table is copied into an external tables/tableN.ods file and
         replaced in the main result by an object element pointing to it.
         Expects the context item to be the table element. -->
    <xsl:template name="TableComplexe">
        <!-- Rank of this table among all tables of the document; makes the file name unique -->
        <xsl:variable name="tableId">
            <xsl:number level="any" count="table"/>
        </xsl:variable>
        <object type="application/vnd.oasis.opendocument.spreadsheet"
            data="tables/table{$tableId}.ods">
            <!-- The secondary document is serialized with the html method: attributes and
                 content of the source table are copied unchanged -->
            <xsl:result-document href="tables/table{$tableId}.ods" method="html" encoding="utf-8">
                <xsl:element name="table">
                    <xsl:copy-of select="@*|node()"/>
                </xsl:element>
            </xsl:result-document>
        </object>
    </xsl:template>
396

haroldcb's avatar
haroldcb committed
397
    <!-- Table caption: kept as plain text. The string value of the element is used;
         selecting node() would join the child nodes with a space in XSLT 2.0 and insert
         spurious blanks around inline markup. -->
    <xsl:template match="caption" mode="textOnly">
        <caption>
            <xsl:value-of select="."/>
        </caption>
    </xsl:template>
402

haroldcb's avatar
haroldcb committed
403 404 405 406 407
    <!-- Table row: re-created as tr, children handled in textOnly mode -->
    <xsl:template match="tr" mode="textOnly">
        <tr>
            <!-- no select attribute: child nodes are selected by default -->
            <xsl:apply-templates mode="textOnly"/>
        </tr>
    </xsl:template>
408

haroldcb's avatar
haroldcb committed
409
    <!-- Table cells: td and th both become td. The cell is re-applied to itself in table
         mode, so the table-mode rules process its content. -->
    <xsl:template match="td | th" mode="textOnly">
        <td>
            <xsl:choose>
                <!-- Cell with a direct ul/ol child: content is not wrapped in a p -->
                <xsl:when test="ul|ol">
                    <xsl:apply-templates select="." mode="table"/>
                </xsl:when>
                <!-- Otherwise the content is wrapped in a single p -->
                <xsl:otherwise>
                    <p>
                        <xsl:apply-templates select="." mode="table"/>
                    </p>
                </xsl:otherwise>
            </xsl:choose>
        </td>
    </xsl:template>
425 426 427

    <!-- Image links inside table cells produce no output -->
    <xsl:template match="a[@class='image']" mode="table"/>

428
    <!-- b inside a table cell: b is not allowed, however em is allowed, so b is replaced by em;
         its content continues in textOnly mode -->
    <xsl:template match="b" mode="table" priority="1">
        <xsl:element name="em" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
434

435
    <!-- span children of a td whose class contains 'mw-edit' produce no output in table mode -->
    <xsl:template match="td/span[contains(@class, 'mw-edit')]" mode="table"/>
436

haroldcb's avatar
haroldcb committed
437 438
    <!-- Abbreviation tags: the abbr wrapper is dropped, its content is processed in textOnly mode -->
    <xsl:template match="abbr" mode="table" priority="1">
        <xsl:apply-templates select="node()" mode="textOnly"/>
    </xsl:template>
441

442
    <!-- sup directly inside abbr, in table mode: re-created as sup with its text only -->
    <xsl:template match="abbr/sup" mode="table" priority="1">
        <sup>
            <xsl:value-of select="."/>
        </sup>
    </xsl:template>
447

448 449 450 451 452
    <!-- Lists inside table cells: re-created under the same local name, items handled in
         textOnly mode. The namespace is spelled out; it is the stylesheet's default
         namespace, which an unprefixed xsl:element name resolves to anyway. -->
    <xsl:template match="ul|ol" mode="table" priority="1">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates mode="textOnly"/>
        </xsl:element>
    </xsl:template>
453

454 455 456 457


    <!-- Images: a thumbnail div becomes a div holding its images followed by its captions -->
    <xsl:template match="div[contains(@class,'thumb')]">
        <div>
            <xsl:apply-templates select=".//img" mode="textOnly"/>
            <xsl:apply-templates select=".//p[@class='thumbcaption']"/>
        </div>
    </xsl:template>
463

464
    <!-- Image: referenced by file name under ./ressources/. The file name is the last path
         segment of src; translate() maps '?' to '_' and removes '%'. -->
    <xsl:template match="img" mode="textOnly">
        <img src="./ressources/{translate(tokenize(@src, '/')[last()],'?%','_')}" alt="{@alt}"/>
    </xsl:template>

468

469
    <!-- Thumbnail caption: a p built from the direct text children of the caption.
         NOTE(review): text held in child elements (e.g. links) is not included; confirm intended. -->
    <xsl:template match="p[@class='thumbcaption']">
        <p>
            <xsl:value-of select="text()"/>
        </p>
    </xsl:template>

475

476
    <!-- ===== Ignored content ===== -->
477

478 479
    <!-- Only keeping a with information: in-page references (href starting with '#')
         produce no output in any mode -->
    <xsl:template match="a[starts-with(@href, '#')]" mode="#all"/>
480

481 482
    <!-- Removing Wikipedia internal sup in every mode: they are not useful to us
         (sup are "cite source / reference" etc.) -->
    <xsl:template match="sup" mode="#all"/>
483

484 485 486
    <!-- Ignoring, in every mode and with explicit priority 2, paragraphs that have no child
         node and spans whose class contains 'mw-edit' -->
    <xsl:template match="p[empty(node())]" mode="#all" priority="2"/>
    <xsl:template match="span[contains(@class, 'mw-edit')]" mode="#all" priority="2"/>
487

488 489
    <!-- Ignoring table of contents h2 (the h2 child of the div with id 'toctitle') -->
    <xsl:template match="div[@id='toctitle']/h2" priority="1"/>
490

491
    <!-- Ignoring end of file h2: see also, etc... -->
    <!--    <xsl:template match="h2[not(following-sibling::p intersect following-sibling::h2[1]/preceding-sibling::p)]"/>-->
    <!-- Last h2 followed by a list of external links that itself comes after an h2 -->
    <xsl:template
        match=" h2[following-sibling::ul[li/a[@class='external text'] and preceding-sibling::h2]][last()]"/>
    <!-- Last h2 that has a preceding h2 and is followed by a references div -->
    <xsl:template
        match=" h2[following-sibling::div[@class='references-small decimal'] and preceding-sibling::h2][last()]"/>
    <!-- Last h2 followed by an h3 whose span has id 'Notes' -->
    <xsl:template match="h2[following-sibling::h3[span[@id='Notes']]][last()]"/>
    <!-- h2 of the navigation block -->
    <xsl:template match="div[@id='mw-navigation']/h2"/>
    <!-- h2 identified by the span ids 'Voir_aussi' and 'Notes_et_r.C3.A9f.C3.A9rences' -->
    <xsl:template match="h2[span[@id='Voir_aussi']]"/>
    <xsl:template match="h2[span[@id='Notes_et_r.C3.A9f.C3.A9rences']]"/>
501

502
    <!-- Ignoring divs by default, in every mode: they are not relevant to us.
         Divs containing a table or an image are excluded from this rule. -->
    <xsl:template match="div[not(descendant::table) and not(descendant::img)]" mode="#all"/>
504
</xsl:stylesheet>