Photon 1.0.0
Loading...
Searching...
No Matches
css_tokenizer.h
Go to the documentation of this file.
1#ifndef LH_CSS_TOKENIZER_H
2#define LH_CSS_TOKENIZER_H
3
4namespace litehtml
5{
6
7// https://www.w3.org/TR/css-syntax-3/#tokenization
8// :;,()[]{} token or delim token: type == this char
9// EOF token: type == EOF (-1)
10// type may be 0 to indicate an error, see at()
12{
14
15 // Giving EOF and some chars explicit names to facilitate debugging and to get rid of warning C4063: case '41' is not a valid value for switch of enum 'litehtml::css_token_type'
16 _EOF = EOF,
23 COLON = ':',
24 SEMICOLON = ';',
25 COMMA = ',',
26 BANG = '!',
27 DOT = '.',
28 AMPERSAND = '&',
29
30 IDENT = -20, // do not collide with any unicode chars
31 FUNCTION, // calc(
32 AT_KEYWORD, // @media
33 HASH, // #foo
34 STRING, // "xxx" or 'xxx'
36 URL, // url(x.com) - but not url("x.com"), which is function + string + ')'
38 NUMBER, // 25
39 PERCENTAGE, // 25%
40 DIMENSION, // 25px
41 CDO, // <!--
42 CDC, // -->
43
44 // https://www.w3.org/TR/css-syntax-3/#component-value
46 // simple block:
47 CURLY_BLOCK = -100 - '{',
48 ROUND_BLOCK = -100 - '(',
49 SQUARE_BLOCK = -100 - '['
50};
51
57
63
64// css_token: CSS token or component value ("fat" token)
65// Tokens exist in uncomponentized form only for a short time after tokenization; most of the time they are "fat".
66// All functions in css_parser work regardless of whether tokens are fat or not, as per standard.
67// All functions outside of css_parser that parse media queries, selectors, property values assume tokens are componentized.
69{
71 float number = 0, css_number_type number_type = css_number_integer, string str = "")
72 : type(type), str(str), n{number, number_type}
73 {
74 if (is_component_value()) new(&value) vector<css_token>;
75 }
76
78 : type(type), str(str), n()
79 {
80 if (is_component_value()) new(&value) vector<css_token>;
81 }
82
83 css_token(const css_token& token) : type(token.type), str(token.str), repr(token.repr)
84 {
85 switch (type)
86 {
87 case HASH:
88 hash_type = token.hash_type;
89 break;
90
91 case NUMBER:
92 case PERCENTAGE:
93 case DIMENSION:
94 n = token.n;
95 break;
96
97 case CV_FUNCTION:
98 case CURLY_BLOCK:
99 case ROUND_BLOCK:
100 case SQUARE_BLOCK:
101 new(&value) vector(token.value);
102 break;
103
104 default:;
105 }
106 }
107
109 {
110 this->~css_token();
111 new(this) css_token(token);
112 return *this;
113 }
114
116 {
117 str.~string();
118 if (is_component_value()) value.~vector();
119 }
120
122 {
123 return type <= CV_FUNCTION;
124 }
125
126 string ident() const;
127 string get_repr(bool insert_spaces = false) const;
128
129 union {
131 int ch; // used for <delim-token> or :;,()[]{}
132 };
133 union {
134 string str; // STRING, URL
135 string name; // HASH, IDENT, AT_KEYWORD, FUNCTION, CV_FUNCTION
136 string unit; // DIMENSION
137 };
138 struct number {
139 float number; // NUMBER, PERCENTAGE, DIMENSION
140 css_number_type number_type; // NUMBER, DIMENSION
141 };
142 union {
145 vector<css_token> value; // CV_FUNCTION, XXX_BLOCK
146 };
147
148 string repr; // https://www.w3.org/TR/css-syntax-3/#representation
149};
150
151using css_token_vector = vector<css_token>;
152string get_repr(const css_token_vector& tokens, int index = 0, int count = -1, bool insert_spaces = false);
153
155{
156public:
157 css_tokenizer(const string& input) : str(input), index(0), current_char(0) {}
158
160
161private:
162 // Input stream. Valid UTF-8; no NUL bytes. https://www.w3.org/TR/css-syntax-3/#input-stream
163 string str;
164
165 // Index of the next input char. https://www.w3.org/TR/css-syntax-3/#next-input-code-point
166 int index;
167
168 // https://www.w3.org/TR/css-syntax-3/#current-input-code-point
169 // This is needed to handle the case where unconsume_char is called when index == str.size().
170 // We need to distinguish between the case where we have just read the last char and
171 // the case where we were already at the end and have just read NUL.
172 // If we don't do this, the tokenizer will loop forever on input "a".
174
175private:
176 static bool is_whitespace(int ch);
177 static bool is_non_printable_code_point(int ch);
178 static bool is_ident_start_code_point(int ch);
179 static bool is_ident_code_point(int ch);
180
181 struct three_chars { int _1, _2, _3; };
182
183 int consume_char();
184 void unconsume_char();
185 int peek_char();
187
188 void consume_comments();
190 css_token consume_string_token(int ending_code_point);
191
192 static bool would_start_ident_sequence(three_chars chars);
193 string consume_ident_sequence();
194
195 static bool would_start_a_number(int x, int y, int z);
196 static double convert_string_to_number(const string& str);
197 double consume_number(css_number_type& number_type);
199
202
205};
206
207void css_parse_error(string msg);
208inline css_token_vector tokenize(const string& str)
209{
210 return css_tokenizer(str).tokenize();
211}
212
213} // namespace litehtml
214
215#endif // LH_CSS_TOKENIZER_H
Definition css_tokenizer.h:155
int peek_char()
Definition css_tokenizer.cpp:96
css_token_vector tokenize()
Definition css_tokenizer.cpp:711
static bool is_whitespace(int ch)
Definition css_tokenizer.cpp:54
css_tokenizer(const string &input)
Definition css_tokenizer.h:157
void consume_comments()
Definition css_tokenizer.cpp:114
static bool is_non_printable_code_point(int ch)
Definition css_tokenizer.cpp:60
static bool is_ident_code_point(int ch)
Definition css_tokenizer.cpp:70
css_token consume_numeric_token()
Definition css_tokenizer.cpp:388
int current_char
Definition css_tokenizer.h:173
int consume_char()
Definition css_tokenizer.cpp:77
int index
Definition css_tokenizer.h:166
double consume_number(css_number_type &number_type)
Definition css_tokenizer.cpp:333
css_token consume_string_token(int ending_code_point)
Definition css_tokenizer.cpp:175
css_token consume_token()
Definition css_tokenizer.cpp:559
static double convert_string_to_number(const string &str)
Definition css_tokenizer.cpp:288
css_token consume_ident_like_token()
Definition css_tokenizer.cpp:520
static bool would_start_ident_sequence(three_chars chars)
Definition css_tokenizer.cpp:219
void unconsume_char()
Definition css_tokenizer.cpp:86
string str
Definition css_tokenizer.h:163
static bool is_ident_start_code_point(int ch)
Definition css_tokenizer.cpp:65
void consume_remnants_of_bad_url()
Definition css_tokenizer.cpp:421
css_token consume_url_token()
Definition css_tokenizer.cpp:440
int consume_escaped_code_point()
Definition css_tokenizer.cpp:139
static bool would_start_a_number(int x, int y, int z)
Definition css_tokenizer.cpp:269
three_chars peek_chars()
Definition css_tokenizer.cpp:102
string consume_ident_sequence()
Definition css_tokenizer.cpp:241
Definition core.h:1598
constexpr auto count() -> size_t
Definition core.h:1538
type
Definition core.h:681
Definition background.h:12
vector< css_token > css_token_vector
Definition css_tokenizer.h:151
css_hash_type
Definition css_tokenizer.h:59
@ css_hash_unrestricted
Definition css_tokenizer.h:60
@ css_hash_id
Definition css_tokenizer.h:61
css_token_type
Definition css_tokenizer.h:12
@ WHITESPACE
Definition css_tokenizer.h:13
@ SEMICOLON
Definition css_tokenizer.h:24
@ RIGHT_BRACE
Definition css_tokenizer.h:18
@ AT_KEYWORD
Definition css_tokenizer.h:32
@ LEFT_PAREN
Definition css_tokenizer.h:21
@ STRING
Definition css_tokenizer.h:34
@ DOT
Definition css_tokenizer.h:27
@ LEFT_BRACE
Definition css_tokenizer.h:17
@ COLON
Definition css_tokenizer.h:23
@ IDENT
Definition css_tokenizer.h:30
@ DIMENSION
Definition css_tokenizer.h:40
@ CDO
Definition css_tokenizer.h:41
@ BANG
Definition css_tokenizer.h:26
@ FUNCTION
Definition css_tokenizer.h:31
@ PERCENTAGE
Definition css_tokenizer.h:39
@ AMPERSAND
Definition css_tokenizer.h:28
@ LEFT_BRACKET
Definition css_tokenizer.h:19
@ COMMA
Definition css_tokenizer.h:25
@ SQUARE_BLOCK
Definition css_tokenizer.h:49
@ _EOF
Definition css_tokenizer.h:16
@ CURLY_BLOCK
Definition css_tokenizer.h:47
@ RIGHT_BRACKET
Definition css_tokenizer.h:20
@ BAD_URL
Definition css_tokenizer.h:37
@ BAD_STRING
Definition css_tokenizer.h:35
@ ROUND_BLOCK
Definition css_tokenizer.h:48
@ CDC
Definition css_tokenizer.h:42
@ NUMBER
Definition css_tokenizer.h:38
@ URL
Definition css_tokenizer.h:36
@ CV_FUNCTION
Definition css_tokenizer.h:45
@ HASH
Definition css_tokenizer.h:33
@ RIGHT_PAREN
Definition css_tokenizer.h:22
css_number_type
Definition css_tokenizer.h:53
@ css_number_integer
Definition css_tokenizer.h:54
@ css_number_number
Definition css_tokenizer.h:55
string get_repr(const css_token_vector &tokens, int index=0, int count=-1, bool insert_spaces=false)
Definition css_tokenizer.cpp:40
void css_parse_error(string msg)
Definition css_tokenizer.cpp:7
css_token_vector tokenize(const string &str)
Definition css_tokenizer.h:208
Definition css_tokenizer.h:138
css_number_type number_type
Definition css_tokenizer.h:140
float number
Definition css_tokenizer.h:139
Definition css_tokenizer.h:69
string unit
Definition css_tokenizer.h:136
string str
Definition css_tokenizer.h:134
css_token(css_token_type type, const string &str)
Definition css_tokenizer.h:77
css_token_type type
Definition css_tokenizer.h:130
string ident() const
Definition css_tokenizer.cpp:12
string repr
Definition css_tokenizer.h:148
string get_repr(bool insert_spaces=false) const
Definition css_tokenizer.cpp:27
css_hash_type hash_type
Definition css_tokenizer.h:143
bool is_component_value() const
Definition css_tokenizer.h:121
number n
Definition css_tokenizer.h:144
css_token(const css_token &token)
Definition css_tokenizer.h:83
css_token(css_token_type type=css_token_type(), float number=0, css_number_type number_type=css_number_integer, string str="")
Definition css_tokenizer.h:70
~css_token()
Definition css_tokenizer.h:115
css_token & operator=(const css_token &token)
Definition css_tokenizer.h:108
int ch
Definition css_tokenizer.h:131
string name
Definition css_tokenizer.h:135
vector< css_token > value
Definition css_tokenizer.h:145
Definition css_tokenizer.h:181
int _3
Definition css_tokenizer.h:181
int _2
Definition css_tokenizer.h:181
int _1
Definition css_tokenizer.h:181
annotation input
Definition tag_strings.h:114