wiki_to_hdoc.xsl 18.4 KB
Newer Older
1
2
3
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
4
    xmlns:functx="http://www.functx.com"
5
6
7
8
9
    exclude-result-prefixes="xs"
    version="2.0"
    xmlns="http://www.utc.fr/ics/hdoc/xhtml">
    
    <xsl:output method="xhtml" indent="yes"/>
10
11
    <xsl:preserve-space elements="pre"/>
        
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
    <!-- Fallback rules for the default (unnamed) mode.
         Any element that no more specific rule matches produces no output. -->
    <xsl:template match="*"/>
    <!-- Text nodes reached in the default mode are written out as their string value. -->
    <xsl:template match="text()">
        <xsl:value-of select="."/>
    </xsl:template>
    
    <!-- Entry rule for the source html element: emits the oxygen schema processing
         instruction, then the output html skeleton whose head and body are filled
         by the rules matching the source head and body children. -->
    <xsl:template match="html">
        <!-- Schema link -->
        <xsl:processing-instruction name="oxygen">RNGSchema="http://scenari.utc.fr/hdoc/schemas/xhtml/hdoc1-xhtml.rng" type="xml"</xsl:processing-instruction>
        <!-- Line feed so the root element starts on its own line after the processing instruction -->
        <xsl:text>&#10;</xsl:text>
        
        <!-- html content -->
        <html>
            <head>
                <xsl:apply-templates select="head"/>
            </head>
            <body>
                <xsl:apply-templates select="body"/>
            </body>
        </html>
    </xsl:template>
    
    <!-- Head template: keeps only the source title (through the title rule) and adds
         three fixed meta elements (charset, generator, author). Every other child of
         the source head is dropped because only title is selected. -->
    <xsl:template match="head">
        <xsl:apply-templates select="title"/>
        <meta charset="utf-8" />
        <meta name="generator" content="HdocConverter/wikipedia"/>
        <meta name="author" content="Wikipedia"/>
    </xsl:template>
    
    <!-- Body template: emits one introduction section followed by one section per h2.
         Both selections are document-wide (they start from the root with //), so the
         headings do not have to be direct children of body. -->
    <xsl:template match="body">
        <!-- The first h1 of the document becomes the introduction section.
             (//h1)[1] selects exactly one node; the unparenthesised //h1[1] would select
             every h1 that is the first h1 among its own siblings and could therefore
             emit several introduction sections. -->
        <xsl:apply-templates select="(//h1)[1]" mode="introduction"/>
        
        <!-- Next are all parts of the document: h2 -->
        <xsl:apply-templates select="//h2"/>
    </xsl:template>
    
    <!-- Copies the string value of the source title into an output title element. -->
    <xsl:template match="title">
        <title><xsl:value-of select="."/></title>
    </xsl:template>
    
    <!-- Beginning of the wikipedia page: introduction paragraph.
         Emits a section typed "introduction" with the fixed title "Introduction".
         Its content is every p of the document that has no h2 before it in document
         order and is not inside a table cell. -->
    <xsl:template match="h1[1]" mode="introduction">
        <!-- Introduction text before first h2 (first section); computed once and reused
             for both the emptiness test and the actual processing. -->
        <xsl:variable name="introParagraphs" select="//p[not(preceding::h2) and not(ancestor::td)]"/>
        <section data-hdoc-type="introduction">
            <header>
                <h1>Introduction</h1>
            </header>
            
            <!-- The wrapping div is only emitted when at least one paragraph was selected -->
            <xsl:if test="exists($introParagraphs)">
                <div><xsl:apply-templates select="$introParagraphs" mode="textOnly"/></div>
            </xsl:if>
        </section>
    </xsl:template>

    <!-- Wikipedia sections and subsections.
         A source heading h2 to h5 becomes an output section containing, in order:
           1. a header whose h1 holds the heading text (mode "title"),
           2. the thumbnail divs found between this heading and the next heading,
           3. a div with the content located directly under this heading,
           4. the sections of the sub-headings one level below.
         All boundaries are computed on siblings and rely on a following h2 sibling:
         without one, the content and sub-heading selections are empty. -->
    <xsl:template match="h2|h3|h4|h5">
        <section>
            <!-- h3 sections are opale "grains" -->
            <xsl:if test="self::h3">
                <xsl:attribute name="data-hdoc-type">unit-of-content</xsl:attribute>
            </xsl:if>
            <header>
                <h1><xsl:apply-templates select="node()" mode="title"/></h1>
            </header>
            
            <!-- Storing current section to know when apply template has to be called in the next parts of the template -->
            <xsl:variable name="currentSectionTitle" select="." />

            <!-- Next heading of any level after the current one. Only real headings h1 to h6
                 count as a boundary; a name test on the first letter alone would also stop
                 at elements such as hr or header. -->
            <xsl:variable name="nextTitle" select="following-sibling::*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][1]"/>

            <!-- Left or right aligned thumbnails located between the current heading and the next one -->
            <xsl:apply-templates select="following-sibling::div[contains(@class,'thumb tleft') or contains(@class,'thumb tright')] intersect $nextTitle/preceding-sibling::*"/>

            <!-- If there is text right below the section name, copy it
                 (that is: the very next sibling element is not an h3, h4, h5 or h6) -->
            <xsl:if test="not(following-sibling::*[1] intersect following-sibling::h3) and not(following-sibling::*[1] intersect following-sibling::h4) and not(following-sibling::*[1] intersect following-sibling::h5) and not(following-sibling::*[1] intersect following-sibling::h6)">
                <div>
                    <!-- Candidates are the siblings located before the next h2 -->
                    <xsl:for-each select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                        <!-- Keep a candidate only if it is not itself an h3 to h6 and no h3 to h6
                             stands between the current heading and the candidate -->
                        <xsl:if test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                            <xsl:apply-templates select="." mode="textOnly"/>
                        </xsl:if>
                    </xsl:for-each>
                </div>
            </xsl:if>
            
            <!-- Applying template of subsections if any -->
            <xsl:choose>
                <xsl:when test="self::h2">
                    <!-- h2 can have h3 subsections -->
                    <xsl:apply-templates select="following-sibling::h3 intersect following-sibling::h2[1]/preceding-sibling::h3"/>
                </xsl:when>
                <xsl:when test="self::h3">
                    <!-- Apply template to h4 subsections of h3. These h4 are below the current h3: previous h3 of these h4 is current h3. -->
                    <xsl:for-each select="following-sibling::h4 intersect following-sibling::h2[1]/preceding-sibling::h4">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h4">
                    <!-- Apply template to the following h5 that come before the next h2 and whose
                         nearest preceding h3 and h4 are those of the current h4 -->
                    <xsl:for-each select="following-sibling::h5 intersect following-sibling::h2[1]/preceding-sibling::h5">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h5">
                    <!-- Apply template to the following h6 that come before the next h2 and whose
                         nearest preceding h3, h4 and h5 are those of the current h5 -->
                    <xsl:for-each select="following-sibling::h6 intersect following-sibling::h2[1]/preceding-sibling::h6">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle/preceding-sibling::h4[1]) and (preceding-sibling::h5[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
            </xsl:choose>
        </section>
    </xsl:template>
    
    <!-- Wikipedia h6 is not a section in hdoc: div with h6 title.
         The div holds the h6 text (mode "titleh6") followed by the content found
         directly under this h6. -->
    <xsl:template match="h6">
        <div>
            <h6><xsl:apply-templates select="node()" mode="titleh6"/></h6>
            
            <xsl:variable name="currentSectionTitle" select="." />
            <!-- Text of h6 is contained between current h6 and next section title (next h3/h4/h5/h6) -->
            <!-- Candidates are the siblings before the next h2; when there is no following h2
                 sibling this selection is empty and no content is emitted. -->
            <xsl:for-each select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                <!-- Keep a candidate only if it is not an h3 to h6 itself and no h3 to h6 stands
                     between the current h6 and the candidate -->
                <xsl:if test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                    <xsl:apply-templates select="." mode="textOnly"/>
                </xsl:if>
            </xsl:for-each>
        </div>
    </xsl:template>

    <!-- Text elements not surrounded by div.
         In mode "textOnly", p, span, i, ul and ol are re-created under the same local
         name in the hdoc namespace; source attributes are not copied and the children
         are processed in the same mode. -->
    <xsl:template match="p|span|i|ul|ol" mode="textOnly">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- Paragraph template (default mode): same output as the "textOnly" rule for p,
         but wrapped in a div. The children are processed in mode "textOnly". -->
    <xsl:template match="p">
        <div>
            <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
                <xsl:apply-templates select="node()" mode="textOnly"/>
            </xsl:element>
        </div>
    </xsl:template>
    
    <!-- li (any mode): the item is re-created in the hdoc namespace and its whole
         content is placed inside a single p, processed in mode "textOnly". -->
    <xsl:template match="li" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <p><xsl:apply-templates select="node()" mode="textOnly"/></p>
        </xsl:element>
    </xsl:template>
    
    <!-- text followed directly by ul not allowed in li.
         For a list item that contains a nested list, the nodes that are preceding
         siblings of the first nested ul go into a p, and the nested lists are emitted
         after that p. -->
    <xsl:template match="li[descendant::ul]" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <p><xsl:apply-templates select="descendant::node() intersect descendant::ul[1]/preceding-sibling::node()" mode="textOnly"/></p>
            <!-- Only the outermost nested lists are processed here. A ul located inside another
                 nested ul is reached when that outer ul processes its own items, so selecting
                 every descendant ul would output the deeper lists twice. -->
            <xsl:apply-templates select="descendant::ul except descendant::ul/descendant::ul" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- Definition lists (any mode).
         A dl containing at least one dt becomes a ul; each child dt is processed in the
         default mode and produces one item. -->
    <xsl:template match="dl[descendant::dt]" mode="#all">
        <ul><xsl:apply-templates select="dt"/></ul>
    </xsl:template>
    
    <!-- A dl without any dt produces no wrapper: only its child dd are processed,
         in the default mode. -->
    <xsl:template match="dl[not(descendant::dt)]" mode="#all">
        <xsl:apply-templates select="dd"/>
    </xsl:template>
    
    <!-- dt contains the title.
         Each dt becomes a list item: the term text in an emphasised paragraph, followed
         by the content of the dd elements that belong to this term. -->
    <xsl:template match="dt">
        <!-- Remember the term being processed so that predicates can compare against it -->
        <xsl:variable name="term" select="."/>
        <li>
            <p><em><xsl:apply-templates select="child::node()" mode="titleh6"/></em></p>
            
            <!-- Some dt are followed by multiple dd before the next dt: process every dd whose
                 nearest preceding dt is the current term -->
            <xsl:apply-templates select="following-sibling::dd[preceding-sibling::dt[1] is $term]"/>
            
            <!-- Some wikipedia articles don't use dd after dt: fall back to the first p that
                 follows the enclosing element -->
            <xsl:if test="empty(following-sibling::dd)">
                <xsl:apply-templates select="parent::node()/following-sibling::p[1]" mode="textOnly"/>
            </xsl:if>
        </li>
    </xsl:template>
    
    <!-- dd contains the content.
         The children are processed in the default mode inside a p; the child ul and ol
         are then processed again in mode "textOnly" and emitted after that p. -->
    <xsl:template match="dd">
        <p><xsl:apply-templates select="node()"/></p>
        <xsl:apply-templates select="ul|ol" mode="textOnly"/>
    </xsl:template>
    
    <!-- Rules for title elements (h1, h2...).
         In mode "title" an element emits no markup of its own: processing simply
         continues with its children in the same mode. -->
    <xsl:template match="*" mode="title">
        <xsl:apply-templates select="node()" mode="title"/>
    </xsl:template>
    
    <!-- In mode "titleh6" any element is reduced to its string value. The explicit
         priority of 2 places this rule above the rules declared with mode "#all"
         that have a lower priority. -->
    <xsl:template match="*" mode="titleh6" priority="2">
        <xsl:value-of select="."/>
    </xsl:template>
    
    <!-- Bold text in mode "textOnly": emitted as em in the hdoc namespace, children
         processed in the same mode. -->
    <xsl:template match="b" mode="textOnly">
        <!-- b is not allowed, however em is allowed: replacing all b by em -->
        <xsl:element name="em" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- b outside p, wrap it into a p (and convert it to em).
         Applies to a b that has no p, li or a ancestor. -->
    <xsl:template match="b[not(ancestor::p) and not(ancestor::li) and not(ancestor::a)]" mode="textOnly">
        <p><em><xsl:apply-templates select="node()" mode="textOnly"/></em></p>
    </xsl:template>
    
    <!-- Link elements -->
    <!-- a in title not allowed, only keeping text.
         Applies to a link that is a child of a span that is itself a child of an
         h2 to h6 heading; the link is replaced by its string value in every mode. -->
    <xsl:template match="h2/span/a | h3/span/a | h4/span/a | h5/span/a | h6/span/a" mode="#all">
        <xsl:value-of select="."/>
    </xsl:template>
    
    <!-- Generic link (any mode): re-created in the hdoc namespace with an absolute href
         and the string value of the source link as text.
         The href is made absolute as follows:
           - protocol-relative ("//host/path"): "http:" is prepended;
           - already absolute (starts with a URI scheme such as "https:"): kept as is;
           - anything else (site-relative such as "/wiki/Page", or missing):
             "http://wikipedia.org" is prepended.
         Prefixing unconditionally would corrupt external links, which already carry
         their own scheme and host. -->
    <xsl:template match="a" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:attribute name="href"
                select="if (starts-with(@href, '//')) then concat('http:', @href)
                        else if (matches(@href, '^[A-Za-z][A-Za-z0-9+.\-]*:')) then string(@href)
                        else concat('http://wikipedia.org', @href)"/>
            <xsl:value-of select="."/>
        </xsl:element>
    </xsl:template>
    
    <!-- Link elements without actual wikipedia page: keeping only their text.
         Applies to any link whose class attribute contains "new". -->
    <xsl:template match="a[contains(@class, 'new')]" mode="#all">
        <xsl:value-of select="."/>
    </xsl:template>
    
251
    <!-- Handle Listing -->
252
253
254
255
256
257
258
259
   <!-- Code listing container: a div whose class is exactly "mw-highlight mw-content-ltr"
        emits nothing itself and hands its child pre elements to the pre rule. -->
   <xsl:template match="div[@class='mw-highlight mw-content-ltr']" mode="textOnly">
        <xsl:apply-templates select="pre" mode="textOnly"/>
    </xsl:template>
    
    <!-- Code listing: the text of a pre is written to a secondary document
         listing/listing{N}.xml (N is the position of the pre among all pre of the
         source document) and an object element pointing at that file is emitted in
         the main output. -->
    <xsl:template match="pre" mode="textOnly">
        <xsl:variable name="vUid">
            <xsl:number level="any" count="pre"/>
        </xsl:variable>
        
        <!-- Plain text of the listing. Only the string value is needed; copying the
             attributes of the pre into a temporary tree is an error in XSLT 2.0
             (an attribute cannot be added to a document node). -->
        <xsl:variable name="listingContent" select="string(.)"/>
        
        <!-- The secondary document is serialised with method="text" and its markup is
             written by hand, so the characters that would break that markup have to be
             escaped here: first the ampersand, then the less-than sign. -->
        <xsl:variable name="listingContentWithoutBadCharacters"
            select="replace(replace($listingContent, '&amp;', '&amp;amp;'), '&lt;', '&amp;lt;')"/>
        
        <!-- Everything up to and including the first line feed is discarded, and the call
             below discards everything from the last line feed onwards.
             NOTE(review): presumably meant to trim a blank first and last line; a listing
             whose first line holds code loses that line. Confirm against real input. -->
        <xsl:variable name="listingContent2"
            select="substring-after($listingContentWithoutBadCharacters, '&#10;')"/>
        
        <xsl:variable name="basename">
            <xsl:call-template name="substring-before-last">
                <xsl:with-param name="string1" select="$listingContent2" />
                <xsl:with-param name="string2" select="'&#10;'" />
            </xsl:call-template>
        </xsl:variable>
        <object type="application/pdf" data="listing/listing{$vUid}.xml">    <!-- Change the application/pdf type once the rng has been changed -->
            <xsl:result-document href="listing/listing{$vUid}.xml" method="text">
                <xsl:text disable-output-escaping="no">
&lt;sc:item xmlns:sc="http://www.utc.fr/ics/scenari/v3/core"&gt;
	&lt;op:code xmlns:sp="http://www.utc.fr/ics/scenari/v3/primitive" xmlns:op="utc.fr:ics/opale3"&gt;
		&lt;sc:code mimeType="text/plain"&gt;</xsl:text>
                <xsl:value-of select="$basename"/>
                <xsl:text disable-output-escaping="no">&lt;/sc:code&gt;
	&lt;/op:code&gt;
&lt;/sc:item&gt;
                </xsl:text>
            </xsl:result-document>
        </object>
    </xsl:template>
    
    <!-- Returns the part of string1 located before the LAST occurrence of string2.
         Parameters:
           string1  the text to cut (default: empty string)
           string2  the delimiter (default: empty string)
         Result: text nodes forming the prefix. Nothing is returned when either
         parameter is empty or when string1 does not contain string2.
         Works recursively: the text before the first delimiter is emitted, and as long
         as the remainder still contains a delimiter, the delimiter is emitted and the
         template recurses on the remainder. -->
    <xsl:template name="substring-before-last">
        <xsl:param name="string1" select="''" />
        <xsl:param name="string2" select="''" />
        
        <xsl:if test="$string1 != '' and $string2 != ''">
            <xsl:variable name="head" select="substring-before($string1, $string2)" />
            <xsl:variable name="tail" select="substring-after($string1, $string2)" />
            <xsl:value-of select="$head" />
            <!-- Another delimiter remains: the one just consumed was not the last -->
            <xsl:if test="contains($tail, $string2)">
                <xsl:value-of select="$string2" />
                <xsl:call-template name="substring-before-last">
                    <xsl:with-param name="string1" select="$tail" />
                    <xsl:with-param name="string2" select="$string2" />
                </xsl:call-template>
            </xsl:if>
        </xsl:if>
    </xsl:template>
    
312
  <!--  <xsl:template match="span" mode="textOnly">
        <xsl:apply-templates/>
    </xsl:template>-->
315
316
    
    
haroldcb's avatar
haroldcb committed
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
    <!-- Simple Tables.
         A table without merged cells and without nested table is rebuilt as an hdoc
         table; any other table is delegated to the TableComplexe template.
         Merged cells are detected on both td and th: the cell rule below does not
         carry colspan or rowspan over, so a table with merged header cells cannot be
         rebuilt as a simple table without shifting its columns. -->
    <xsl:template match="table" mode="textOnly">
        <xsl:choose>
            <xsl:when test="not(descendant::td/@colspan | descendant::td/@rowspan | descendant::th/@colspan | descendant::th/@rowspan | descendant::table)">
                <table>
                    <xsl:apply-templates select="node()" mode="textOnly"/>
                </table>
            </xsl:when>
            <xsl:otherwise>
                <xsl:call-template name="TableComplexe"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>
    
    <!-- Complex table: called with a source table as context node. The table is copied
         unchanged (attributes and content) into a secondary document
         tables/table{N}.ods, N being the position of the table among all tables of the
         source document, and an object element pointing at that file is emitted in the
         main output. -->
    <xsl:template name="TableComplexe">
        <xsl:variable name="tableId">
            <xsl:number level="any" count="table"/>
        </xsl:variable>
        <object type="application/vnd.oasis.opendocument.spreadsheet" data="tables/table{$tableId}.ods">
            <!-- NOTE(review): the file is serialised with method="html" although it is named .ods
                 and declared as a spreadsheet; presumably a later step converts it. Confirm. -->
            <xsl:result-document href="tables/table{$tableId}.ods" method="html" encoding="UTF-8">
                <xsl:element name="table">
                    <xsl:copy-of select="@*|node()"></xsl:copy-of>
                </xsl:element>                
            </xsl:result-document>
        </object>
    </xsl:template>
    
    <!-- Table caption: reduced to plain text.
         The string value of the caption itself is used. Selecting the child nodes
         instead would make XSLT 2.0 join them with a space, which inserts spurious
         blanks around inline markup (for instance before punctuation that follows a
         link or bold text). -->
    <xsl:template match="caption" mode="textOnly">
        <caption><xsl:value-of select="."/></caption>
    </xsl:template>
    
    <!-- Table row: re-created as tr, children processed in mode "textOnly". -->
    <xsl:template match="tr" mode="textOnly">
        <tr>
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </tr>
    </xsl:template>
    
    <!-- Table cell: both td and th are emitted as td holding a single p. The cell itself
         is re-dispatched in mode "table".
         NOTE(review): no rule for td or th in mode "table" is visible in this file, so the
         content presumably flows through the built-in rules into the mode "table" rules
         for b, abbr and sup. Confirm. -->
    <xsl:template match="td | th" mode="textOnly">
        <td><p><xsl:apply-templates select="." mode="table"/></p></td>
    </xsl:template>
    
    <!-- Bold text inside a table cell (mode "table"): emitted as em in the hdoc
         namespace; its children are processed in mode "textOnly". -->
    <xsl:template match="b" mode="table">
        <!-- b is not allowed, however em is allowed: replacing all b by em -->
        <xsl:element name="em" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- Tags for abbreviations: inside a table cell an abbr is reduced to its text. -->
    <xsl:template match="abbr" mode="table" priority="1">
        <xsl:value-of select="."/>
    </xsl:template>
    
    <!-- Inside a table cell a sup is kept, with its string value as content. The priority
         of 1 places this rule above the mode "#all" rule for sup, which has no explicit
         priority and discards the element. -->
    <xsl:template match="sup" mode="table" priority="1">
        <sup><xsl:value-of select="."/></sup>
    </xsl:template>
    
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391


    <!-- Images.
         A div whose class contains "thumb" (default mode only) becomes a div holding
         the output of every descendant img followed by the output of every descendant
         p whose class is exactly "thumbcaption". -->
    <xsl:template match="div[contains(@class,'thumb')]">
        <div> 
            <xsl:apply-templates select=".//img"/>
            <xsl:apply-templates select=".//p[@class='thumbcaption']"/>
        </div>
    </xsl:template>  

    <!-- Image: the source is rewritten to ./ressources/ followed by the last
         "/"-separated segment of the original src; the alt text is carried over. -->
    <xsl:template match="img">
     <img src="./ressources/{tokenize(@src, '/')[last()]}" alt="{@alt}"/>
    </xsl:template>

    <!-- Thumbnail caption: emitted as a p containing the direct text children only.
         NOTE(review): text held by child elements (links, emphasis) is not selected by
         text() and is therefore left out of the caption. Confirm this is intended. -->
    <xsl:template match="p[@class='thumbcaption']">
        <p> <xsl:value-of select="text()"/> </p>
    </xsl:template>

haroldcb's avatar
haroldcb committed
392
    
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
    <!-- ===== Ignored content ===== -->
    <!-- Every rule in this section has an empty body: the matched node and its whole
         subtree produce no output. -->
    
    <!-- Only keeping a with information: we give up page references
         (links whose href starts with "#") -->
    <xsl:template match="a[starts-with(@href, '#')]" mode="#all"/>
    
    <!-- Removing Wikipedia internal sup, they are not useful to us (sup are "cite source / reference" etc...) -->
    <xsl:template match="sup" mode="#all"/>
    
    <!-- Ignoring empty text elements only relevant to Wikipedia: p without any child node
         and span whose class contains "mw-edit". The priority of 2 places these rules
         above the other p and span rules. -->
    <xsl:template match="p[empty(node())]" mode="#all" priority="2"/>
    <xsl:template match="span[contains(@class, 'mw-edit')]" mode="#all" priority="2"/>
    
    <!-- Ignoring table of contents h2 -->
    <xsl:template match="div[@id='toctitle']/h2" priority="1"/>
    
    <!-- Ignoring end of file h2: see also, etc...
         Matches an h2 with no p sibling between it and the next h2. An h2 without any
         following h2 sibling also matches, because the intersection is then empty. -->
    <xsl:template match="h2[not(following-sibling::p intersect following-sibling::h2[1]/preceding-sibling::p)]"/>
    
    
    <!-- Ignoring divs by default: they are not relevant to us -->
    <xsl:template match="div" mode="#all"/>
</xsl:stylesheet>