normalization.pl 3.08 KB
Newer Older
Etienne Chognard's avatar
Update  
Etienne Chognard committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/perl

# Line-by-line normalization of an input file into an intermediate XML file.
#
# Usage: perl <this script> INPUT_FILE
#
# The result is written to "<script dir>/../tmp/intermediaire.xml".
#
# For every input line the script:
#   * strips "&nbsp;" entities and repairs "<br" tags split across two lines;
#   * drops lines holding a DOCTYPE or a <meta>/<META> tag;
#   * replaces a line holding "<html" with a bare "<html>";
#   * turns "== Title ==" style lines (1 to 6 "=" signs) into <h1>..<h6>;
#   * turns a "{# ... #}" section into <meta>...</meta>, where each
#     "#key value" pair becomes <key>value</key>;
#   * turns "{$ ... $}", "{- ... -}", "{: ... :}" and "{! ... !}" sections
#     into <important>, <exemple>, <definition> and <attention> elements;
#   * copies any line that triggered none of the above unchanged.

use strict;
use warnings;

use FindBin;

my ($InputPath) = @ARGV;

# Without this guard a missing argument is not reported: three-argument open
# with an undefined file name opens an anonymous temporary file, so the script
# would quietly produce an output holding nothing but the header comment.
defined $InputPath
    or die "Usage: $0 INPUT_FILE\n";

my $output_path = $FindBin::Bin . "/../tmp/intermediaire.xml";

# OPENING INPUT AND OUTPUT FILES
open(my $in, '<', $InputPath)
    or die "Cannot open '$InputPath' for reading: $!\n";
open(my $out, '>', $output_path)
    or die "Cannot open '$output_path' for writing: $!\n";

# Callout sections, in the order they are evaluated for each line.
# "open" and "close" are the literal markers looked for anywhere in the line.
my @blocks = (
    { tag => 'important',  open => '{$', close => '$}' },
    { tag => 'exemple',    open => '{-', close => '-}' },
    { tag => 'definition', open => '{:', close => ':}' },
    { tag => 'attention',  open => '{!', close => '!}' },
);

# State carried from one line to the next.
my $inside_meta = 0;
my %inside      = map { $_->{tag} => 0 } @blocks;

print {$out} '<!-- <?oxygen RNGSchema="http://scenari.utc.fr/hdoc/schemas/xhtml/hdoc1-xhtml.rng" type="xml"?> -->';

while (my $line = <$in>) {

    # Set as soon as any rule below has handled the line; an unhandled line is
    # copied to the output as-is at the bottom of the loop.
    my $is_done = 0;

    # REMOVE NON-BREAKABLE SPACES (USELESS)
    $line =~ s/&nbsp;//g;

    # REPLACE A BROKEN "<br" AT END OF LINE BY "<br/>"
    # "$" matches before the trailing newline, so the newline is kept and the
    # complete tag is appended after it.
    if ($line =~ s/<br$//) {
        $line .= '<br/>';
    }

    # REMOVE THE LEFTOVER "/>" OF A BROKEN br TAG AT THE START OF THE NEXT LINE
    $line =~ s{^/>}{};

    # REMOVE EXISTING DOCTYPE AND META
    if ($line =~ /<!DOCTYPE|<meta|<META/) {
        $is_done = 1;
    }

    # REPLACE HTML TAG BY A SIMPLE ONE (WITHOUT NAMESPACE)
    if ($line =~ /<html/) {
        print {$out} "<html>";
        $is_done = 1;
    }

    # TITLE LEVELS: the number of "=" signs gives the heading level.
    if ($line =~ /(={1,6})\s(.*)?\s\1/) {
        my ($marks, $title) = ($1, $2);
        my $level = length $marks;
        print {$out} '<h' . $level . '>' . $title . '</h' . $level . '>';
        $is_done = 1;
    }

    # META DATA BLOCK
    if (index($line, '{#') >= 0) {
        $inside_meta = 1;
        print {$out} '<meta>';
        $is_done = 1;
    }

    if (index($line, '#}') >= 0) {
        $inside_meta = 0;
        print {$out} '</meta>';
        $is_done = 1;
    }

    if ($inside_meta) {
        # Emit every "#key value" pair found on the line. A value runs up to
        # the next "#", so several pairs may share one line; looping on the
        # /g match visits each of them instead of only the first.
        while ($line =~ /\#([a-z]{1,15})\s([^\#]+)/g) {
            print {$out} "<" . $1 . ">" . $2 . "</" . $1 . ">";
        }
        $is_done = 1;
    }

    # IMPORTANT / EXAMPLE / DEFINITION / ATTENTION BLOCKS
    # For each kind, in order: close the section, echo the line if the section
    # is still open, then open the section. Because a line is echoed only when
    # its section was already open before the line and is not closed by it,
    # the text found on the marker lines themselves is not echoed by this step.
    for my $block (@blocks) {
        my $tag       = $block->{tag};
        my $has_open  = index($line, $block->{open})  >= 0;
        my $has_close = index($line, $block->{close}) >= 0;

        if ($has_close && !$has_open) {
            $inside{$tag} = 0;
            print {$out} '</' . $tag . '>';
            $is_done = 1;
        }

        if ($inside{$tag}) {
            print {$out} $line;
            $is_done = 1;
        }

        if ($has_open) {
            $inside{$tag} = 1;
            print {$out} '<' . $tag . '>';
            $is_done = 1;
        }
    }

    # Nothing above recognised the line: copy it through unchanged.
    print {$out} $line unless $is_done;

} # END OF FILE LOOP

close($in);

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close is checked on the output file.
close($out)
    or die "Cannot close '$output_path': $!\n";