<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    exclude-result-prefixes="xs"
    version="2.0"
    xmlns="http://www.utc.fr/ics/hdoc/xhtml">
    
    <xsl:output method="xhtml" indent="yes"/>
    
    <!-- Default mode: any element without a more specific rule produces no output. -->
    <xsl:template match="*"/>
    <!-- Default mode: text nodes are reproduced unchanged. -->
    <xsl:template match="text()">
        <xsl:copy/>
    </xsl:template>
    
    <xsl:template match="html">
        <!-- Processing instruction (target "oxygen") that links the output to the hdoc RelaxNG schema -->
        <xsl:processing-instruction name="oxygen"
            select="'RNGSchema=&quot;http://scenari.utc.fr/hdoc/schemas/xhtml/hdoc1-xhtml.rng&quot; type=&quot;xml&quot;'"/>
        <xsl:text>&#10;</xsl:text>

        <!-- Output skeleton: the source head and body are each delegated to their own template -->
        <html>
            <head><xsl:apply-templates select="child::head"/></head>
            <body><xsl:apply-templates select="child::body"/></body>
        </html>
    </xsl:template>
    
    <!-- Head template -->
    <xsl:template match="head">
        <!-- The source title comes first, followed by fixed metadata describing the conversion -->
        <xsl:apply-templates select="child::title"/>
        <meta charset="utf-8"/>
        <meta name="generator" content="HdocConverter/wikipedia"/>
        <meta name="author" content="Wikipedia"/>
    </xsl:template>
    
    <!-- Body template -->
    <xsl:template match="body">
        <!-- The first h1 of the document (in document order) opens the introduction section.
             Note: "//h1[1]" would select every h1 that is the first h1 child of its own parent,
             possibly producing several introductions; "(//h1)[1]" selects exactly one node. -->
        <xsl:apply-templates select="(//h1)[1]" mode="introduction"/>

        <!-- Then every h2 of the document becomes a top-level section (unwanted h2 are filtered
             out by the dedicated empty h2 templates) -->
        <xsl:apply-templates select="//h2"/>
    </xsl:template>
    
    <!-- Page title: only its string value is kept, any inline markup is flattened -->
    <xsl:template match="title">
        <title>
            <xsl:value-of select="string(.)"/>
        </title>
    </xsl:template>
    
    <!-- Beginning of the wikipedia page: introduction paragraph -->
    <xsl:template match="h1[1]" mode="introduction">
        <!-- Introduction text: every paragraph of the document that has no h2 before it
             (i.e. located before the first section) and that is not inside a table cell.
             Computed once and reused for both the emptiness test and the output. -->
        <xsl:variable name="introParagraphs" select="//p[count(preceding::h2)=0 and not(ancestor::td)]"/>

        <section data-hdoc-type="introduction">
            <header>
                <h1>Introduction</h1>
            </header>

            <!-- The wrapping div is only emitted when there is at least one such paragraph -->
            <xsl:if test="exists($introParagraphs)">
                <div><xsl:apply-templates select="$introParagraphs" mode="textOnly"/></div>
            </xsl:if>
        </section>
    </xsl:template>

    <!-- Wikipedia sections and subsections -->
    <!--
        Turns a heading (h2 to h5) into an hdoc section.
        The section receives: a header built from the heading content (mode "title"),
        an optional div holding the sibling content located directly under the heading,
        and the nested sections built from the next heading level.
        NOTE: all boundaries rely on "following-sibling::h2[1]"; when no h2 follows the
        current heading, the intersections below are empty and nothing is selected.
    -->
    <xsl:template match="h2|h3|h4|h5">
        <section>
            <!-- h3 sections are Opale "grains" -->
            <xsl:if test="self::h3">
                <xsl:attribute name="data-hdoc-type">unit-of-content</xsl:attribute>
            </xsl:if>
            <header>
                <h1><xsl:apply-templates select="node()" mode="title"/></h1>
            </header>

            <!-- The current heading is stored so that it can still be referenced once the context node changes inside xsl:for-each -->
            <xsl:variable name="currentSectionTitle" select="." />

            <!-- If the element right after the heading is not itself a sub-heading (h3 to h6), copy the text located directly under the heading -->
            <xsl:if test="not(following-sibling::*[1] intersect following-sibling::h3) and not(following-sibling::*[1] intersect following-sibling::h4) and not(following-sibling::*[1] intersect following-sibling::h5) and not(following-sibling::*[1] intersect following-sibling::h6)">
                <div>
                    <!-- Candidates: following siblings located before the next h2 -->
                    <xsl:for-each select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*">
                        <!-- Keep a sibling only if no h3/h4/h5/h6 stands between the current heading and it, and if it is not a heading itself -->
                        <xsl:if test="not(preceding-sibling::h3 intersect $currentSectionTitle/following-sibling::h3) and not($currentSectionTitle/following-sibling::h4 intersect preceding-sibling::h4) and not($currentSectionTitle/following-sibling::h5 intersect preceding-sibling::h5) and not($currentSectionTitle/following-sibling::h6 intersect preceding-sibling::h6) and not(self::h3) and not(self::h4)  and not(self::h5)  and not(self::h6)">
                            <xsl:apply-templates select="." mode="textOnly"/>
                        </xsl:if>
                    </xsl:for-each>
                </div>
            </xsl:if>

            <!-- Applying template of subsections if any -->
            <xsl:choose>
                <xsl:when test="self::h2">
                    <!-- h2 can have h3 subsections: every h3 located before the next h2 -->
                    <xsl:apply-templates select="following-sibling::h3 intersect following-sibling::h2[1]/preceding-sibling::h3"/>
                </xsl:when>
                <xsl:when test="self::h3">
                    <!-- h4 subsections of the current h3: h4 located before the next h2 whose nearest preceding h3 is the current h3 -->
                    <xsl:for-each select="following-sibling::h4 intersect following-sibling::h2[1]/preceding-sibling::h4">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h4">
                    <!-- h5 subsections of the current h4: h5 located before the next h2, sharing the same nearest preceding h3 as the current h4 and whose nearest preceding h4 is the current h4 -->
                    <xsl:for-each select="following-sibling::h5 intersect following-sibling::h2[1]/preceding-sibling::h5">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
                <xsl:when test="self::h5">
                    <!-- h6 blocks of the current h5: h6 located before the next h2, sharing the same nearest preceding h3 and h4 as the current h5 and whose nearest preceding h5 is the current h5 -->
                    <xsl:for-each select="following-sibling::h6 intersect following-sibling::h2[1]/preceding-sibling::h6">
                        <xsl:if test="(preceding-sibling::h3[1] intersect $currentSectionTitle/preceding-sibling::h3[1]) and (preceding-sibling::h4[1] intersect $currentSectionTitle/preceding-sibling::h4[1]) and (preceding-sibling::h5[1] intersect $currentSectionTitle)">
                            <xsl:apply-templates select="."/>
                        </xsl:if>
                    </xsl:for-each>
                </xsl:when>
            </xsl:choose>
        </section>
    </xsl:template>
    
    <!-- Wikipedia h6 is not a section in hdoc: div with h6 title -->
    <xsl:template match="h6">
        <!-- The heading itself, kept for use after the context node changes -->
        <xsl:variable name="heading" select="."/>
        <!-- Following siblings located before the next h2 -->
        <xsl:variable name="untilNextH2" select="following-sibling::* intersect following-sibling::h2[1]/preceding-sibling::*"/>

        <div>
            <h6><xsl:apply-templates mode="titleh6"/></h6>

            <!-- Body of the h6 block: siblings that are not headings (h3 to h6) and that are
                 not separated from this h6 by any h3/h4/h5/h6 -->
            <xsl:for-each select="$untilNextH2">
                <xsl:if test="not(
                        (preceding-sibling::h3 intersect $heading/following-sibling::h3)
                        or ($heading/following-sibling::h4 intersect preceding-sibling::h4)
                        or ($heading/following-sibling::h5 intersect preceding-sibling::h5)
                        or ($heading/following-sibling::h6 intersect preceding-sibling::h6)
                        or self::h3 or self::h4 or self::h5 or self::h6)">
                    <xsl:apply-templates select="." mode="textOnly"/>
                </xsl:if>
            </xsl:for-each>
        </div>
    </xsl:template>

    <!-- Text elements not surrounded by div -->
    <!-- p, i, ul and ol are re-created under the same local name in the hdoc namespace and
         their children are processed in textOnly mode.
         "span" is deliberately not listed: the stylesheet declares a separate
         match="span" mode="textOnly" rule with the same priority, and listing span in both
         places is an ambiguous-rule-match error (XTRE0540) that processors either report
         or resolve by taking the last declared rule. -->
    <xsl:template match="p|i|ul|ol" mode="textOnly">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:apply-templates select="node()" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- Paragraph template -->
    <!-- Default mode: a paragraph is emitted as an hdoc p wrapped in a div; its children are processed in textOnly mode -->
    <xsl:template match="p">
        <div>
            <p>
                <xsl:apply-templates mode="textOnly"/>
            </p>
        </div>
    </xsl:template>
    
    <!-- li -->
    <!-- List item (any mode): its whole content is placed inside a single p -->
    <xsl:template match="li" mode="#all">
        <li>
            <p>
                <xsl:apply-templates mode="textOnly"/>
            </p>
        </li>
    </xsl:template>
    
    <!-- text followed directly by ul not allowed in li -->
    <!-- List item containing a nested ul (any mode): text directly followed by ul is not
         allowed in li, so the nodes preceding the first ul go into a p and the nested
         lists are emitted after it. -->
    <xsl:template match="li[descendant::ul]" mode="#all">
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <!-- Descendant nodes that are preceding siblings of the first nested ul -->
            <p><xsl:apply-templates select="descendant::node() intersect descendant::ul[1]/preceding-sibling::node()" mode="textOnly"/></p>
            <!-- Only the outermost nested ul are applied here (those having no ul ancestor
                 inside this li). Deeper ul are produced when their own parent li is
                 processed; selecting every descendant ul would output them twice. -->
            <xsl:apply-templates select="descendant::ul[not(ancestor::ul intersect current()/descendant::ul)]" mode="textOnly"/>
        </xsl:element>
    </xsl:template>
    
    <!-- dl/dt/dd: a dl with dt becomes a ul (one li per dt); a dl without dt only outputs its dd -->
    <!-- Definition list holding at least one dt (any mode): rendered as a ul, one item per child dt -->
    <xsl:template match="dl[.//dt]" mode="#all">
        <ul>
            <xsl:apply-templates select="child::dt"/>
        </ul>
    </xsl:template>
    
    <!-- Definition list without any dt (any mode): no list wrapper, only its child dd are processed -->
    <xsl:template match="dl[empty(.//dt)]" mode="#all">
        <xsl:apply-templates select="child::dd"/>
    </xsl:template>
    
    <!-- dt contains the title -->
    <xsl:template match="dt">
        <li>
            <!-- The dt content is the emphasised title of the item -->
            <p><em><xsl:apply-templates mode="titleh6"/></em></p>

            <!-- A dt may be followed by several dd before the next dt: output every
                 following dd whose nearest preceding dt is this dt -->
            <xsl:apply-templates select="following-sibling::dd[preceding-sibling::dt[1] is current()]"/>

            <!-- Some Wikipedia articles do not use dd after dt: when no dd follows at all,
                 fall back to the first paragraph following the parent element -->
            <xsl:if test="empty(following-sibling::dd)">
                <xsl:apply-templates select="../following-sibling::p[1]" mode="textOnly"/>
            </xsl:if>
        </li>
    </xsl:template>
    
    <!-- dd contains the content -->
    <!-- dd holds the content: its children are processed in the default mode inside a p,
         then its child lists (ul/ol) are output after that p in textOnly mode -->
    <xsl:template match="dd">
        <p>
            <xsl:apply-templates/>
        </p>
        <xsl:apply-templates select="child::ul | child::ol" mode="textOnly"/>
    </xsl:template>
    
    <!-- Rules for title elements (h1, h2...) -->
    <!-- Mode "title" (used for section headings): an element contributes no markup of its own, processing simply continues with its children in the same mode -->
    <xsl:template match="*" mode="title">
        <xsl:apply-templates mode="#current"/>
    </xsl:template>
    
    <!-- Mode "titleh6": any element is reduced to its string value (priority 2 puts this rule ahead of default-priority rules) -->
    <xsl:template match="*" mode="titleh6" priority="2">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
    <!-- b is not allowed in the target format whereas em is: every b is emitted as em -->
    <xsl:template match="b" mode="textOnly">
        <em>
            <xsl:apply-templates mode="textOnly"/>
        </em>
    </xsl:template>
    
    <!-- b outside p, wrap it into a p (and convert it to em) -->
    <!-- b with no p, li or a ancestor: converted to em and wrapped in a p of its own -->
    <xsl:template match="b[not(ancestor::p or ancestor::li or ancestor::a)]" mode="textOnly">
        <p>
            <em>
                <xsl:apply-templates mode="textOnly"/>
            </em>
        </p>
    </xsl:template>
    
    <!-- Link elements -->
    <!-- a in title not allowed, only keeping text -->
    <!-- Links inside heading spans (h2 to h6) are not allowed: only the link text is kept -->
    <xsl:template match="h2/span/a | h3/span/a | h4/span/a | h5/span/a | h6/span/a" mode="#all">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
    <!-- Generic link (any mode): re-created in the hdoc namespace with an absolute href
         and the link's string value as content. -->
    <xsl:template match="a" mode="#all">
        <!-- string() yields '' when the source link has no href attribute -->
        <xsl:variable name="target" select="string(@href)"/>
        <xsl:element name="{local-name()}" namespace="http://www.utc.fr/ics/hdoc/xhtml">
            <xsl:attribute name="href">
                <xsl:choose>
                    <!-- Already absolute (starts with a URI scheme such as http: or https:): keep as is -->
                    <xsl:when test="matches($target, '^[A-Za-z][A-Za-z0-9+.\-]*:')">
                        <xsl:value-of select="$target"/>
                    </xsl:when>
                    <!-- Protocol-relative (//host/path): only the scheme is missing -->
                    <xsl:when test="starts-with($target, '//')">
                        <xsl:value-of select="concat('http:', $target)"/>
                    </xsl:when>
                    <!-- Site-relative (e.g. /wiki/Page): prefixed with the Wikipedia host, as before -->
                    <xsl:otherwise>
                        <xsl:value-of select="concat('http://wikipedia.org', $target)"/>
                    </xsl:otherwise>
                </xsl:choose>
            </xsl:attribute>
            <xsl:value-of select="."/>
        </xsl:element>
    </xsl:template>
    
    <!-- Link elements without actual wikipedia page: keeping only their text -->
    <!-- Links whose class contains 'new' (no actual Wikipedia page behind them): only the text is kept -->
    <xsl:template match="a[contains(@class, 'new')]" mode="#all">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
    <!-- Code listings -->
   <xsl:template match="div[@class='mw-highlight mw-content-ltr']" mode="textOnly">
        <xsl:apply-templates select="pre" mode="textOnly"/>
    </xsl:template>
    
    <!-- Code listing: the text of the pre is written to a secondary text file and the
         main output references that file through an object element. -->
    <xsl:template match="pre" mode="textOnly">
        <!-- Sequence number of this pre among all pre elements of the document; used to build the file name -->
        <xsl:variable name="vUid">
            <xsl:number level="any" count="pre"/>
        </xsl:variable>

        <object type="application/pdf" data="listing/listing{$vUid}.txt">
            <!-- TODO: change the application/pdf type once the RNG schema has been updated -->
            <xsl:result-document href="listing/listing{$vUid}.txt" method="text">
                <!-- Only the string value is written: the text output method ignores markup
                     anyway, and copying the attributes of pre straight into the result
                     document would be a dynamic error (XTDE0420) whenever pre has any. -->
                <xsl:value-of select="."/>
            </xsl:result-document>
        </object>
    </xsl:template>
    
    <!-- span in textOnly mode: no element is emitted; its children are processed in the default (unnamed) mode -->
    <xsl:template match="span" mode="textOnly">
        <xsl:apply-templates select="child::node()"/>
    </xsl:template>
    
    
    <!-- Simple tables -->
    <!-- Table dispatch: a table is "simple" when none of its cells (td or th) carries a
         colspan/rowspan attribute and it contains no nested table; simple tables are
         rebuilt inline, every other table is exported by the TableComplexe template. -->
    <xsl:template match="table" mode="textOnly">
        <xsl:choose>
            <!-- th is checked as well as td: header cells are also emitted as plain td,
                 so a spanning th would otherwise yield a misaligned inline table -->
            <xsl:when test="not(descendant::td/@colspan | descendant::td/@rowspan | descendant::th/@colspan | descendant::th/@rowspan | descendant::table)">
                <table>
                    <xsl:apply-templates select="node()" mode="textOnly"/>
                </table>
            </xsl:when>
            <xsl:otherwise>
                <xsl:call-template name="TableComplexe"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>
    
    <!-- Complex table export; the context node is the table being processed.
         The table (attributes and content copied unchanged) is serialised with the html
         method into tables/table<N>.ods and referenced from the main output by an object element. -->
    <xsl:template name="TableComplexe">
        <!-- N: sequence number of this table among all table elements of the document -->
        <xsl:variable name="tableId">
            <xsl:number count="table" level="any"/>
        </xsl:variable>
        <object type="application/vnd.oasis.opendocument.spreadsheet" data="tables/table{$tableId}.ods">
            <xsl:result-document href="tables/table{$tableId}.ods" method="html" encoding="UTF-8">
                <table>
                    <xsl:copy-of select="@* | node()"/>
                </table>
            </xsl:result-document>
        </object>
    </xsl:template>
    
    <!-- Table caption: reduced to plain text.
         The string value of the caption itself is used: in XSLT 2.0,
         value-of select="node()" would output each child node separately, joined by a
         single space, inserting spurious spaces around inline markup. -->
    <xsl:template match="caption" mode="textOnly">
        <caption><xsl:value-of select="."/></caption>
    </xsl:template>
    
    <!-- Table row: re-created as tr, its children are processed in textOnly mode -->
    <xsl:template match="tr" mode="textOnly">
        <tr><xsl:apply-templates mode="textOnly"/></tr>
    </xsl:template>
    
    <!-- Table cell: td and th are both emitted as a td holding a single p; the cell content is processed in mode "table" -->
    <xsl:template match="td | th" mode="textOnly">
        <td>
            <p>
                <xsl:apply-templates select="node()" mode="table"/>
            </p>
        </td>
    </xsl:template>
    
    <!-- Inside table cells, b (not allowed) is emitted as em; its children are processed in textOnly mode -->
    <xsl:template match="b" mode="table">
        <em>
            <xsl:apply-templates mode="textOnly"/>
        </em>
    </xsl:template>
    
    <!-- Abbreviation and superscript tags inside table cells -->
    <!-- abbr in a table cell: only its text is kept -->
    <xsl:template match="abbr" mode="table" priority="1">
        <xsl:value-of select="string(.)"/>
    </xsl:template>
    
    <!-- sup in a table cell is kept, with its content flattened to text; priority 1 puts this rule ahead of the default-priority rule that drops sup in every mode -->
    <xsl:template match="sup" mode="table" priority="1">
        <sup>
            <xsl:value-of select="string(.)"/>
        </sup>
    </xsl:template>
    
    
    <!-- ===== Ignored content ===== -->
    
    <!-- Only keeping a with information: we give up page references -->
    <!-- Links pointing inside the page (href starting with '#') are dropped in every mode: page references are given up -->
    <xsl:template match="a[starts-with(@href, '#')]" mode="#all"/>
    
    <!-- Removing Wikipedia internal sup in every mode, they are not useful to us (sup are "cite source / reference" etc...) -->
    <xsl:template match="sup" mode="#all"/>
    
    <!-- Ignoring, in every mode, text elements only relevant to Wikipedia: p without any child node, and span whose class contains 'mw-edit' (explicit priority 2) -->
    <xsl:template match="p[empty(node())]" mode="#all" priority="2"/>
    <xsl:template match="span[contains(@class, 'mw-edit')]" mode="#all" priority="2"/>
    
    <!-- Ignoring the h2 of the table of contents (child of div id='toctitle'); default mode only, explicit priority 1 -->
    <xsl:template match="div[@id='toctitle']/h2" priority="1"/>
    
    <!-- Ignoring h2 that have no sibling p between them and the next h2 (end of file h2: see also, etc...).
         NOTE: when no h2 follows, the intersection is empty, so the last h2 among its siblings always matches this rule. -->
    <xsl:template match="h2[not(following-sibling::p intersect following-sibling::h2[1]/preceding-sibling::p)]"/>
    
    
    <!-- Ignoring divs by default, in every mode: they are not relevant to us -->
    <xsl:template match="div" mode="#all"/>
</xsl:stylesheet>