<?xml version="1.0" encoding="UTF-8"?>
<!--
    wiki_to_hdoc.xsl
    Transforms a Wikipedia HTML page (elements in no namespace) into an
    hdoc XHTML document (default namespace http://www.utc.fr/ics/hdoc/xhtml).
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:functx="http://www.functx.com"
    exclude-result-prefixes="xs functx"
    version="2.0"
    xmlns="http://www.utc.fr/ics/hdoc/xhtml">
    
    <!-- Main result is serialized as indented XHTML -->
    <xsl:output method="xhtml" indent="yes"/>
    <!-- Whitespace inside source pre elements is significant (code listings) -->
    <xsl:preserve-space elements="pre"/>
        
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
    <!-- Default mode fallback: an element with no dedicated rule produces no output and its children are not visited -->
    <xsl:template match="*"/>
    <!-- Default mode: text nodes are passed through unchanged -->
    <xsl:template match="text()">
        <xsl:copy/>
    </xsl:template>
    
    <!-- Document root: emits the oXygen schema PI, then the hdoc html skeleton (head + body) -->
    <xsl:template match="html">
        <!-- Processing instruction associating the result with the hdoc RNG schema -->
        <xsl:processing-instruction name="oxygen">RNGSchema="http://scenari.utc.fr/hdoc/schemas/xhtml/hdoc1-xhtml.rng" type="xml"</xsl:processing-instruction>
        <xsl:value-of select="'&#10;'"/>

        <!-- Result document element -->
        <html>
            <head>
                <xsl:apply-templates select="child::head"/>
            </head>
            <body>
                <xsl:apply-templates select="child::body"/>
            </body>
        </html>
    </xsl:template>
    
    <!-- Head: source title first, followed by fixed charset / generator / author metadata -->
    <xsl:template match="head">
        <xsl:apply-templates select="child::title"/>
        <meta charset="utf-8"/>
        <meta name="generator" content="HdocConverter/wikipedia"/>
        <meta name="author" content="Wikipedia"/>
    </xsl:template>
    
    <!--
        Body: builds the introduction section from the first h1 of the document,
        then one section per h2 found anywhere in the document.
    -->
    <xsl:template match="body">
        <!--
            Only the very first h1 in document order starts the introduction.
            "//h1[1]" would select the first h1 child of EVERY parent and could
            therefore emit several introduction sections; "(//h1)[1]" cannot.
        -->
        <xsl:apply-templates select="(//h1)[1]" mode="introduction"/>
        
        <!-- Next are all parts of the document: h2 -->
        <xsl:apply-templates select="//h2"/>
    </xsl:template>
    
    <!-- Page title: only its string value is kept -->
    <xsl:template match="title">
        <title>
            <xsl:value-of select="string(.)"/>
        </title>
    </xsl:template>
    
    <!--
        Beginning of the Wikipedia page: introduction section.
        Emits a section typed "introduction" with a fixed "Introduction" title,
        followed (when present) by every paragraph that has no h2 before it in
        document order and is not located inside a table cell.
    -->
    <xsl:template match="h1[1]" mode="introduction">
        <!-- Computed once: the same document-wide selection is needed for the test and for the output -->
        <xsl:variable name="introParagraphs" select="//p[count(preceding::h2)=0 and not(ancestor::td)]"/>
        <section data-hdoc-type="introduction">
            <header>
                <h1>Introduction</h1>
            </header>
            
            <!-- Introduction text before first h2 (first section); no empty div when there is none -->
            <xsl:if test="exists($introParagraphs)">
                <div><xsl:apply-templates select="$introParagraphs" mode="textOnly"/></div>
            </xsl:if>
        </section>
    </xsl:template>

    <!--
        Wikipedia sections and subsections.
        Each h2/h3/h4/h5 becomes an hdoc section made of:
          1. a header holding the title (processed in mode "title"),
          2. the thumbnail divs located between this title and the next title,
          3. a div with the text directly below the title (mode "textOnly"),
          4. the nested sections of the next level (h3 in h2, h4 in h3, ...).
        Titles are flat siblings in the source, so nesting is rebuilt with
        sibling-axis intersections.
        NOTE(review): most selections are bounded by "following-sibling::h2[1]";
        when the current title has no following h2 sibling those selections are
        empty, so the content of the last h2 block is not copied. Confirm this is intended.
    -->
    <xsl:template match="h2|h3|h4|h5">
        <section>
            <!-- h3 sections are opale "grains" -->
            <xsl:if test="self::h3">
                <xsl:attribute name="data-hdoc-type">unit-of-content</xsl:attribute>
            </xsl:if>
            <header>
                <h1><xsl:apply-templates select="node()" mode="title"/></h1>
            </header>
            
            <!-- Storing current section to know when apply template has to be called in the next parts of the template -->
            <xsl:variable name="currentSectionTitle" select="." />

            <!--
                Next title after the current one, whatever its level.
                Only real heading elements are considered: a name test such as
                starts-with(name(), 'h') would also stop on hr or header elements.
            -->
            <xsl:variable name="nextTitle" select="following-sibling::*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][1]"/>

            <!-- Thumbnails (left or right floated) located between the current title and the next one -->
            <xsl:apply-templates select="following-sibling::div[contains(@class,'thumb tleft') or contains(@class,'thumb tright')] intersect $nextTitle/preceding-sibling::*"/>

            <!-- TODO: image handling was hard to fit into this structure; consider simplifying the section logic -->

            <!-- If there is text right below the section name (i.e. the next sibling is not a sub-title), copy it -->
            <xsl:if test="not(following-sibling::*[1] intersect following-sibling::h3) and not(following-sibling::*[1] intersect following-sibling::h4) and not(following-sibling::*[1] intersect following-sibling::h5) and not(following-sibling::*[1] intersect following-sibling::h6)">
                <div>
                    <!-- Candidates: siblings between the current title and the next h2 -->
                    <xsl:for-each select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                        <!-- Keep only non-title elements that have no h3/h4/h5/h6 between the current title and themselves -->
                        <xsl:if test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                            <xsl:apply-templates select="." mode="textOnly"/>
                        </xsl:if>
                    </xsl:for-each>
                </div>
            </xsl:if>
            
            <!-- Applying template of subsections if any -->
            <xsl:choose>
                <xsl:when test="self::h2">
                    <!-- h2 can have h3 subsections -->
                    <xsl:apply-templates select="following-sibling::h3 intersect following-sibling::h2[1]/preceding-sibling::h3"/>
                </xsl:when>
                <xsl:when test="self::h3">
                    <!-- Apply template to h4 subsections of h3. These h4 are below the current h3: previous h3 of these h4 is current h3. -->
                    <xsl:for-each select="following-sibling::h4 intersect following-sibling::h2[1]/preceding-sibling::h4">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h4">
                    <!-- Apply template to the following h5 that belong to the current h4: same nearest preceding h3, and nearest preceding h4 is the current title -->
                    <xsl:for-each select="following-sibling::h5 intersect following-sibling::h2[1]/preceding-sibling::h5">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h5">
                    <!-- Apply template to the following h6 that belong to the current h5: same nearest preceding h3 and h4, and nearest preceding h5 is the current title -->
                    <xsl:for-each select="following-sibling::h6 intersect following-sibling::h2[1]/preceding-sibling::h6">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle/preceding-sibling::h4[1]) and (preceding-sibling::h5[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
            </xsl:choose>
        </section>
    </xsl:template>
    
    <!--
        Wikipedia h6 is not a section in hdoc: it becomes a div holding an h6
        title (string value only, via mode "titleh6") followed by its text.
    -->
    <xsl:template match="h6">
        <div>
            <h6><xsl:apply-templates select="node()" mode="titleh6"/></h6>
            
            <!-- Remember the current h6: the context item changes inside the for-each below -->
            <xsl:variable name="currentSectionTitle" select="." />
            <!-- Text of h6 is contained between current h6 and next section title (next h3/h4/h5/h6) -->
            <!-- Candidates are the siblings between this h6 and the next h2; nothing is selected when no h2 follows -->
            <xsl:for-each select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                <!-- Keep a candidate only if it is not itself a title and no h3/h4/h5/h6 sits between the current h6 and it -->
                <xsl:if test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                    <xsl:apply-templates select="." mode="textOnly"/>
                </xsl:if>
            </xsl:for-each>
        </div>
    </xsl:template>

    <!--
        Text elements not surrounded by div (mode "textOnly"):
        p, span, i, ul and ol are re-created under the same local name in the
        hdoc namespace; their children are processed in the same mode.
        Source attributes are not copied.
    -->
    <xsl:template match="p|span|i|ul|ol" mode="textOnly">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- Paragraph in default mode: an hdoc p wrapped in a div; its content is processed in mode "textOnly" -->
    <xsl:template match="p">
        <div>
            <p>
                <xsl:apply-templates select="child::node()" mode="textOnly"/>
            </p>
        </div>
    </xsl:template>
    
    <!-- List item (any mode): hdoc li whose whole content is wrapped in a single p -->
    <xsl:template match="li" mode="#all">
        <xsl:element name="li" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <p>
                <xsl:apply-templates select="child::node()" mode="textOnly"/>
            </p>
        </xsl:element>
    </xsl:template>
    
    <!--
        List item containing a nested ul (any mode).
        Text followed directly by ul is not allowed in li: the nodes preceding
        the first nested ul go into a p, then the nested lists follow.
    -->
    <xsl:template match="li[descendant::ul]" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <p><xsl:apply-templates select="descendant::node() intersect descendant::ul[1]/preceding-sibling::node()" mode="textOnly"/></p>
            <!--
                Only the outermost nested lists are applied here. A ul that sits
                inside another ul of this item is already produced when its own
                li is processed; selecting every descendant ul would output it twice.
            -->
            <xsl:apply-templates select="descendant::ul[not(ancestor::ul intersect current()/descendant::ul)]" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- Definition list holding at least one dt (any mode): rendered as a ul, one li per child dt -->
    <xsl:template match="dl[descendant::dt]" mode="#all">
        <ul>
            <xsl:apply-templates select="child::dt"/>
        </ul>
    </xsl:template>
    
    <!-- Definition list without any dt (any mode): only its child dd elements are processed, with no list wrapper -->
    <xsl:template match="dl[not(descendant::dt)]" mode="#all">
        <xsl:apply-templates select="child::dd"/>
    </xsl:template>
    
    <!-- dt holds the title of a definition: rendered as a list item starting with an emphasised title -->
    <xsl:template match="dt">
        <li>
            <p>
                <em><xsl:apply-templates select="child::node()" mode="titleh6"/></em>
            </p>

            <!-- A dt may own several dd: take every following dd whose nearest preceding dt is this one -->
            <xsl:apply-templates select="following-sibling::dd[preceding-sibling::dt[1] is current()]"/>

            <!-- Some Wikipedia articles use no dd after dt: fall back to the first p following the parent list -->
            <xsl:if test="empty(following-sibling::dd)">
                <xsl:apply-templates select="parent::node()/following-sibling::p[1]" mode="textOnly"/>
            </xsl:if>
        </li>
    </xsl:template>
    
    <!-- dd holds the content of a definition: a p built in default mode, then its child lists in mode "textOnly" -->
    <xsl:template match="dd">
        <p>
            <xsl:apply-templates select="child::node()"/>
        </p>
        <xsl:apply-templates select="child::ul | child::ol" mode="textOnly"/>
    </xsl:template>
    
    <!-- Mode "title" (h1, h2...): an element contributes nothing itself, only its children are visited -->
    <xsl:template match="*" mode="title">
        <xsl:apply-templates select="child::node()" mode="title"/>
    </xsl:template>
    
    <!-- Mode "titleh6": any element is reduced to its string value -->
    <xsl:template match="*" mode="titleh6" priority="2">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
    <!-- b is not allowed in the result but em is: every b becomes an em -->
    <xsl:template match="b" mode="textOnly">
        <xsl:element name="em" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="child::node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- b with no p, li or a ancestor: converted to em and additionally wrapped in a p -->
    <xsl:template match="b[not(ancestor::p) and not(ancestor::li) and not(ancestor::a)]" mode="textOnly">
        <p>
            <em>
                <xsl:apply-templates select="child::node()" mode="textOnly"/>
            </em>
        </p>
    </xsl:template>
    
    <!-- Link elements -->
    <!-- Links are not allowed inside titles: only the link text is kept -->
    <xsl:template match="h2/span/a | h3/span/a | h4/span/a | h5/span/a | h6/span/a" mode="#all">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
    <!--
        Generic link (any mode): hdoc a element holding the link text only.
        The href is made absolute:
          - an href that already starts with a URI scheme (http:, https:, mailto:, ...) is kept as is;
          - a protocol-relative href (//host/path) only gets the "http:" scheme;
          - any other href is treated as a path on wikipedia.org.
        Prefixing every href with the Wikipedia host would corrupt external links.
    -->
    <xsl:template match="a" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:attribute name="href" select="if (matches(@href, '^[A-Za-z][A-Za-z0-9+.\-]*:')) then string(@href) else if (starts-with(@href, '//')) then concat('http:', @href) else concat('http://wikipedia.org', @href)"/>
            <xsl:value-of select="."/>
        </xsl:element>
    </xsl:template>
    
    <!-- Links whose class contains 'new' point to no existing Wikipedia page: only their text is kept -->
    <xsl:template match="a[contains(@class, 'new')]" mode="#all">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
252
    <!-- Handle Listing -->
253
254
255
256
257
258
259
260
   <!-- Syntax-highlighting wrapper div: only its child pre elements are processed (as listings) -->
   <xsl:template match="div[@class='mw-highlight mw-content-ltr']" mode="textOnly">
        <xsl:apply-templates select="child::pre" mode="textOnly"/>
    </xsl:template>
    
    <!--
        Code listing: each pre is written to its own file listing/listingN.xml
        (N = position of the pre among all pre elements of the document) and
        referenced from the main result through an object element.
        The secondary file is serialized with method="text", so the XML markup
        around the code is written by hand and the code itself must be escaped here.
    -->
    <xsl:template match="pre" mode="textOnly">
        <xsl:variable name="vUid">
            <xsl:number level="any" count="pre"/>
        </xsl:variable>
        
        <!--
            Only the text of the listing is needed. Copying "@*|node()" into a
            variable builds a document node, which is a dynamic error as soon as
            the pre carries an attribute; the string value is equivalent and safe.
        -->
        <xsl:variable name="listingContent" select="string(.)"/>
        
        <!-- Escape the markup characters: the text output method writes them verbatim into an XML file -->
        <xsl:variable name="listingContentWithoutBadCharacters"
            select="replace(replace(replace($listingContent, '&amp;', '&amp;amp;'), '&lt;', '&amp;lt;'), '&gt;', '&amp;gt;')"/>
        
        <!--
            Drop everything up to and including the first line break...
            NOTE(review): this (together with the step below) discards the first
            and the last line of the listing whatever they contain; it looks like
            it assumes the pre starts and ends with a line break. Confirm.
        -->
        <xsl:variable name="listingContent2" select="substring-after($listingContentWithoutBadCharacters, '&#10;')"/>
        
        <!-- ...and everything from the last line break on -->
        <xsl:variable name="basename">
            <xsl:call-template name="substring-before-last">
                <xsl:with-param name="string1" select="$listingContent2" />
                <xsl:with-param name="string2" select="'&#10;'" />
            </xsl:call-template>
        </xsl:variable>
         <!-- TODO: update the object type once the RNG schema has been changed -->
         <object type="application/x-listing" data="listing/listing{$vUid}.xml">
            <xsl:result-document href="listing/listing{$vUid}.xml" method="text">
                <xsl:text disable-output-escaping="no">
&lt;sc:item xmlns:sc="http://www.utc.fr/ics/scenari/v3/core"&gt;
	&lt;op:code xmlns:sp="http://www.utc.fr/ics/scenari/v3/primitive" xmlns:op="utc.fr:ics/opale3"&gt;
		&lt;sc:code mimeType="text/plain"&gt;</xsl:text>
                <xsl:value-of select="$basename"/>
                <xsl:text disable-output-escaping="no">&lt;/sc:code&gt;
	&lt;/op:code&gt;
&lt;/sc:item&gt;
                </xsl:text>
            </xsl:result-document>
        </object>
    </xsl:template>
    
    <!--
        Named template: returns the part of $string1 located before the LAST
        occurrence of $string2.
        Parameters:
          string1 - text to cut (default: empty string)
          string2 - delimiter (default: empty string)
        Result: text nodes whose concatenation is the requested prefix.
        Nothing is produced when either parameter is empty or when $string1
        does not contain $string2.
    -->
    <xsl:template name="substring-before-last">
        <xsl:param name="string1" select="''" />
        <xsl:param name="string2" select="''" />
        
        <xsl:if test="$string1 != '' and $string2 != ''">
            <!-- Split at the first delimiter -->
            <xsl:variable name="head" select="substring-before($string1, $string2)" />
            <xsl:variable name="tail" select="substring-after($string1, $string2)" />
            <xsl:value-of select="$head" />
            <!-- While another delimiter remains, the one just consumed was not the last: keep it and recurse on the rest -->
            <xsl:if test="contains($tail, $string2)">
                <xsl:value-of select="$string2" />
                <xsl:call-template name="substring-before-last">
                    <xsl:with-param name="string1" select="$tail" />
                    <xsl:with-param name="string2" select="$string2" />
                </xsl:call-template>
            </xsl:if>
        </xsl:if>
    </xsl:template>
    
    
haroldcb's avatar
haroldcb committed
314
315
316
    <!--
        Tables (mode "textOnly").
        A simple table is rebuilt inline; any other table is delegated to the
        TableComplexe named template, which writes it to an external file.
    -->
    <xsl:template match="table" mode="textOnly">
        <xsl:choose>
            <!--
                Simple table: no colspan/rowspan on any cell and no nested table.
                Header cells (th) are checked as well as td: they are rendered as
                td and would otherwise lose their spans silently.
            -->
            <xsl:when test="not(descendant::td/@colspan | descendant::td/@rowspan | descendant::th/@colspan | descendant::th/@rowspan | descendant::table)">
                <table>
                    <xsl:apply-templates select="node()" mode="textOnly"/>        
                </table>
            </xsl:when> 
            <xsl:otherwise>
                <xsl:call-template name="TableComplexe"/>
            </xsl:otherwise>
        </xsl:choose>               
    </xsl:template>
    
329
    <!--
        Complex tables: the table is copied as is into an external file
        tables/tableN.ods (N = position of the table among all table elements
        of the document) and referenced from the main result through an object.
        The context node must be the table element.
        NOTE(review): the file is given an .ods name and media type but is
        serialized with method="html"; confirm the consumer expects this.
    -->
    <xsl:template name="TableComplexe">
        <xsl:variable name="tableId">
            <xsl:number level="any" count="table"/>
        </xsl:variable>
        <object type="application/vnd.oasis.opendocument.spreadsheet" data="tables/table{$tableId}.ods">
            <xsl:result-document href="tables/table{$tableId}.ods" method="html" encoding="utf-8">
                <!-- Attributes and content of the source table are copied untouched -->
                <xsl:element name="table">
                    <xsl:copy-of select="@*|node()"></xsl:copy-of>
                </xsl:element>                
            </xsl:result-document>
        </object>
    </xsl:template>
    
    <!--
        Table caption: plain text only.
        The string value of the caption is used. Selecting "node()" instead
        would make XSLT 2.0 join the child nodes with a space, inserting
        spurious spaces around every inline element of the caption.
    -->
    <xsl:template match="caption" mode="textOnly">
        <caption><xsl:value-of select="."/></caption>
    </xsl:template>
    
    <!-- Table row: rebuilt as an hdoc tr, children processed in mode "textOnly" -->
    <xsl:template match="tr" mode="textOnly">
        <tr><xsl:apply-templates select="child::node()" mode="textOnly"/></tr>
    </xsl:template>
    
    <!--
        Table cell: td and th are both rendered as td.
        The cell is re-applied to itself in mode "table"; no rule matches td/th
        in that mode, so the built-in rule walks its children in mode "table".
        Cell content is wrapped in a p unless the cell has a child list.
    -->
    <xsl:template match="td | th" mode="textOnly">
        <xsl:choose>
            <xsl:when test="not(ul|ol)">
                <td><p><xsl:apply-templates select="." mode="table"/></p></td>
            </xsl:when> 
            <xsl:otherwise>
                <td><xsl:apply-templates select="." mode="table"/></td>
            </xsl:otherwise>
        </xsl:choose>       
    </xsl:template>
    
haroldcb's avatar
haroldcb committed
364
365
    <!-- Inside table cells, links whose class is exactly 'image' are dropped -->
    <xsl:template match="a[@class='image']" mode="table"/>    
    
366
    <!-- b inside a table cell: b is not allowed, however em is allowed, so b is replaced by em -->
    <xsl:template match="b" mode="table" priority="1">
        <xsl:element name="em" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
haroldcb's avatar
haroldcb committed
373
374
    <!-- Inside table cells, span children of td whose class contains 'mw-edit' are dropped -->
    <xsl:template match="td/span[contains(@class, 'mw-edit')]" mode="table"/>
    
haroldcb's avatar
haroldcb committed
375
376
    <!-- Abbreviation tags inside table cells: the abbr element itself is dropped, its content is kept -->
    <xsl:template match="abbr" mode="table" priority="1">
        <xsl:apply-templates select="node()" mode="textOnly"/>
    </xsl:template>
    
380
    <!-- sup that is a child of abbr, in mode "table": kept as an hdoc sup holding its string value -->
    <xsl:template match="abbr/sup" mode="table" priority="1">
        <sup><xsl:value-of select="."/></sup>
    </xsl:template>
    
384
385
386
387
388
389
    <!-- Lists inside table cells: re-created under the same name in the hdoc namespace, items processed in mode "textOnly" -->
    <xsl:template match="ul|ol" mode="table" priority="1">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="child::node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
390
391
392
393
394


    <!--
        Images: a div whose class contains 'thumb' becomes a div holding every
        img found below it, followed by its p elements of class 'thumbcaption'.
    -->
    <xsl:template match="div[contains(@class,'thumb')]">
        <div> 
            <xsl:apply-templates select=".//img" mode="textOnly"/>
            <xsl:apply-templates select=".//p[@class='thumbcaption']"/>
        </div>
    </xsl:template>  

400
401
    <!--
        Image: src is rewritten to ./ressources/<file name>, where the file
        name is the last '/'-separated segment of the source src, with every
        '?' replaced by '_' and every '%' removed; alt is copied.
        NOTE(review): translate() deletes '%' because the replacement string
        has a single character; confirm the files under ./ressources are
        named the same way.
    -->
    <xsl:template match="img" mode="textOnly">
    <img src="./ressources/{translate(tokenize(@src, '/')[last()],'?%','_')}" alt="{@alt}"/>
    </xsl:template>

404

405
406
407
408
    <!-- Thumbnail caption: only the direct text children are kept, joined by a single space; text of child elements is left out -->
    <xsl:template match="p[@class='thumbcaption']">
        <p><xsl:value-of select="child::text()" separator=" "/></p>
    </xsl:template>

haroldcb's avatar
haroldcb committed
409
    
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
    <!-- ===== Ignored content ===== -->
    
    <!-- Only links carrying information are kept: in-page references (href starting with '#') are dropped in every mode -->
    <xsl:template match="a[starts-with(@href, '#')]" mode="#all"/>
    
    <!-- Wikipedia internal sup elements ("cite source" / reference marks, etc.) are not useful here: dropped in every mode -->
    <xsl:template match="sup" mode="#all"/>
    
    <!-- Dropped in every mode (priority 2): p elements without any child node, and span elements whose class contains 'mw-edit' -->
    <xsl:template match="p[empty(node())]" mode="#all" priority="2"/>
    <xsl:template match="span[contains(@class, 'mw-edit')]" mode="#all" priority="2"/>
    
    <!-- The table-of-contents h2 (child of the div with id 'toctitle') does not start a section -->
    <xsl:template match="div[@id='toctitle']/h2" priority="1"/>
    
    <!-- Ignoring end of file h2: see also, etc... -->
426
427
<!--    <xsl:template match="h2[not(following-sibling::p intersect following-sibling::h2[1]/preceding-sibling::p)]"/>-->
    <!-- End-of-page h2: last of the sibling h2 that are followed by a ul which holds 'external text' links and is itself preceded by an h2 -->
    <xsl:template match=" h2[following-sibling::ul[li/a[@class='external text'] and preceding-sibling::h2]][last()]"/>
428
429
    <!-- End-of-page h2: last of the sibling h2 that are preceded by another h2 and followed by a div of class 'references-small decimal' -->
    <xsl:template match=" h2[following-sibling::div[@class='references-small decimal'] and preceding-sibling::h2][last()]"></xsl:template>
        <!-- End-of-page h2: last of the sibling h2 that are followed by an h3 containing a span with id 'Notes' -->
        <xsl:template match="h2[following-sibling::h3[span[@id='Notes']]][last()]"/>
430
    <!-- The h2 child of the div with id 'mw-navigation' does not start a section -->
    <xsl:template match="div[@id='mw-navigation']/h2"/>
431
432
433
434
    
    <!-- Divs are ignored by default in every mode: they are not relevant to us. More specific div patterns (with a predicate) take precedence over this rule -->
    <xsl:template match="div" mode="#all"/>
</xsl:stylesheet>