-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlanguage.js
475 lines (462 loc) · 24.4 KB
/
language.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
import { Grammar, Tokenizer } from 'earley-parser'
import { AST } from './ast.js'
import SyntacticTypes from './syntactic-types.js'
import { notationStringToArray } from './utilities.js'
import { builtInConcepts } from './built-in-concepts.js'
import { Template } from './template.js'
/**
* The Language class represents one of the languages among which a
* {@link Converter} can convert notation. To add a new language to a
* {@link Converter} instance, just call the constructor of this class, passing
* the {@link Converter} instance. Every Language instance needs a
* {@link Converter} instance to which it belongs, because the purpose of this
* class is to enable conversions among languages, by working together with a
* {@link Converter} instance.
*
* Each instance of this class contains the data passed to its constructor, plus
* a reference to its {@link Converter}, plus a tokenizer and parser for reading
* notation written in the language represented by this instance. After
* creating an instance of this class, you specify the language itself by
* repeated calls to the {@link Language#addNotation addNotation()} function.
* This is necessary in order for the language to have any definition at all,
* before using it to parse text.
*
* Although you can call methods in this class directly to parse text, such as
* {@link Language#parse parse()} and {@link Language#convertTo convertTo()}, it
* is simpler if you instead use the {@link Converter#convert convert()} method
* of the corresponding {@link Converter} instance. Using the {@link Converter}
* will save you from having to deal with intermediate forms like {@link AST
* abstract syntax trees}, but you can create and work with such objects if you
* wish.
*/
export class Language {
    /**
     * To add a new language to a {@link Converter}, just call this constructor,
     * passing the converter object as one of the parameters, and this
     * constructor will add this language to that converter.
     *
     * The default grouping symbols for any newly installed language are `(` and
     * `)`. You may wish to override this default for a number of reasons. For
     * example, internally, when this class adds the putdown language, it
     * specifies the empty array, since putdown uses no grouping symbols. And
     * if you were to define a parser for LaTeX, you might want to add the
     * symbols `{` and `}` to the list, since they are groupers in LaTeX. Note
     * that the array must be of even length, pairs of open and close groupers,
     * in that order, as in `[ '(', ')', '{', '}' ]` for LaTeX.
     *
     * The default linter for any language is the identity function, meaning
     * that no cleanup is needed for expressions of that language. If you want
     * the {@link Converter#convert convert()} function, upon creating an
     * expression in this language, to apply to it any specific formatting
     * conventions you would like to see in the output, you can specify a
     * linter, which will be run before {@link Converter#convert convert()}
     * returns its result. Add such a function only if you see output from
     * {@link Converter#convert convert()} that doesn't meet your standards,
     * aesthetically or for some functional reason.
     *
     * For example, when installing putdown as the initial language in the
     * constructor for this class, it provides a linter that removes unnecessary
     * spaces around parentheses.
     *
     * @param {String} name - the name of the new language (e.g., "latex")
     * @param {Converter} converter - the converter into which to install this
     *   language
     * @param {String[]} groupers - any pairs of grouping symbols used by the
     *   language (as documented above)
     * @param {Function?} linter - a function that cleans up notation in this
     *   language (as documented above)
     * @see {@link Converter#convert convert()}
     * @see {@link Converter#language language()}
     * @see {@link Converter#isLanguage isLanguage()}
     */
    constructor ( name, converter, groupers = [ '(', ')' ], linter = x => x ) {
        // Allow clients to pass "null" for groupers:
        if ( !groupers ) groupers = [ ]
        // Store all parameters:
        this.name = name
        this.converter = converter
        this.groupers = groupers
        this.linter = linter
        // Also create a tokenizer and grammar and store them.
        // Whitespace tokens are discarded (the handler returns null):
        this.tokenizer = new Tokenizer()
        this.tokenizer.addType( /\s/, () => null )
        this.grammar = new Grammar()
        // The parser's start symbol is the top of the first syntactic-type
        // hierarchy (e.g., the most general expression type):
        this.grammar.START = SyntacticTypes.hierarchies[0][0]
        // For storing derived concepts' putdown forms:
        this.derivedNotation = new Map()
        // For each concept in the converter, if it is atomic and has a putdown
        // form that is a single regular expression, use that as the default
        // notation in the new language; this can be overridden by any later
        // call to addNotation(). So we mark it here as the default, so they
        // know they can delete it later.
        Array.from( converter.concepts.keys() ).forEach( conceptName => {
            const concept = converter.concepts.get( conceptName )
            if ( SyntacticTypes.isAtomic( concept.parentType ) ) {
                // Accept only putdown forms that boil down to exactly one
                // regular expression, in any of the supported encodings:
                if ( concept.putdown instanceof Array ) {
                    if ( concept.putdown.length != 1 ) return
                    if ( !( concept.putdown[0] instanceof RegExp ) ) return
                }
                // NOTE(review): when putdown is a one-RegExp array, the check
                // below also runs (an Array is not a RegExp) and passes that
                // array to notationStringToArray() — confirm that function
                // tolerates array input as well as strings.
                if ( !( concept.putdown instanceof RegExp ) ) {
                    const array = notationStringToArray( concept.putdown,
                        Array.from( Template.defaultVariableNames ) )
                    if ( array.length != 1 ) return
                    if ( !( array[0] instanceof RegExp ) ) return
                }
                this.addNotation( conceptName, concept.putdown )
                // Tag the rule just added (it is the last right-hand side) so
                // a later addNotation() call can recognize and replace it:
                const rhss = this.rulesFor( conceptName )
                rhss[rhss.length-1].putdownDefault = true
            }
        } )
        // Add subtyping rules and grouping rules to the grammar.
        // Also, for each atomic type, give it zero rules, to avoid Earley
        // throwing an error about the type not existing, if the client doesn't
        // choose to include that atomic type in their language.
        SyntacticTypes.hierarchies.forEach( hierarchy => {
            // Each type can be parsed as any of its immediate subtypes:
            for ( let i = 0 ; i < hierarchy.length - 1 ; i++ )
                this.grammar.addRule( hierarchy[i], hierarchy[i+1] )
            // Grouping applies only to the hierarchy rooted at START, and
            // wraps its most specific (atomic) type:
            if ( groupers.length > 0 && hierarchy[0] == this.grammar.START ) {
                const last = hierarchy[hierarchy.length-1]
                if ( SyntacticTypes.isAtomic( last ) ) {
                    // groupers come in (open, close) pairs, so step by 2:
                    for ( let i = 0 ; i < groupers.length - 1 ; i += 2 ) {
                        const left = groupers[i]
                        const right = groupers[i+1]
                        this.addNotation( `Grouped${last}`, `${left} A ${right}` )
                    }
                }
            }
            const lowest = hierarchy[hierarchy.length-1]
            if ( !this.grammar.rules.hasOwnProperty( lowest ) )
                this.grammar.rules[lowest] = [ ]
        } )
        // Register ourselves with our converter
        converter.languages.set( name, this )
    }
    /**
     * Add a new notation to this language, for one of its converter's concepts.
     * Specify the name of the concept being represented, then the notation
     * using a string in which the letters `A`, `B`, and `C` represent the
     * first, second, and third arguments, respectively. (You can omit any
     * arguments you do not need. For example, you might write `A+B` for
     * addition, `-A` for negation, or just `\\bot` for the logical constant
     * "false" in LaTeX.)
     *
     * The options object supports the following fields.
     *
     *  * If you need to use one of the letters `A`, `B`, or `C` in the notation
     *    itself, or if you need to use more than three parameters in your
     *    notation (continuing on to `D`, `E`, etc.) then you can use the
     *    options object to specify the variables in your notation. For
     *    example, you could use notation `x+y` and then use the options object
     *    to specify `{ variables : [ 'x', 'y' ] }`. Note that every occurrence
     *    a variable counts as the variable (except inside another word) even if
     *    used multiple times. So choose variable names that do not show up
     *    in the new notation you are introducing.
     *  * If this notation should be used only for representing the concept in
     *    this language, but not for parsing from this language into an AST,
     *    then you can set `writeOnly : true`. This can be useful in two cases.
     *    1. If you have multiple notations for the same concept in some
     *       languages, but not in others. You can map each notation to a
     *       separate concept, then map all concepts to one notation in the
     *       smaller language, marking all but one as write-only, thus
     *       establishing a canonical form. And yet between any two languages
     *       that support all the notations, translation can preserve the
     *       notational subtleties.
     *    2. If you have some notation that is just a shorthand for a more
     *       complex notation, you can parse the notation to a concept named
     *       for that notation, but convert to putdown form in a write-only
     *       way, expanding the notation to its underlying (compound) meaning.
     *       Then the converter will not attempt to invert that expansion when
     *       parsing putdown, but will preserve its expanded meaning.
     *
     * There are no other options at this time besides those documented above,
     * but the options object is available for future expansion.
     *
     * @param {String} conceptName - the name of the concept represented by this
     *   new notation
     * @param {String} notation - the notation being added
     * @param {Object?} options - any additional options, as documented above
     */
    addNotation ( conceptName, notation, options = { } ) {
        // Keep the notation exactly as the caller gave it, for later rendering:
        const originalNotation = notation
        options = Object.assign( {
            variables : Array.from( Template.defaultVariableNames )
        }, options )
        // ensure concept is valid
        const concept = this.converter.concept( conceptName )
        if ( !concept )
            throw new Error( `Not a valid concept: ${conceptName}` )
        // if this is an atomic concept, and the only previous notation it had
        // was the putdown default installed at language creation time, then the
        // user is now overriding that, so we should delete it.
        if ( SyntacticTypes.isAtomic( concept.parentType )
          && this.grammar.rules.hasOwnProperty( conceptName )
          && this.grammar.rules[conceptName].length == 1
          && this.grammar.rules[conceptName][0].putdownDefault )
            delete this.grammar.rules[conceptName]
        // convert notation to array if needed and extract its tokens
        if ( notation instanceof RegExp )
            notation = [ notation ]
        // notationToPutdown maps each variable's position in the notation
        // (notation index) to its argument index in the concept (putdown
        // index); variables are replaced by their syntactic types:
        const notationToPutdown = [ ]
        if ( !( notation instanceof Array ) ) {
            notation = notationStringToArray( notation, options.variables ).map(
                piece => {
                    const variableIndex = options.variables.indexOf( piece )
                    if ( variableIndex > -1 ) {
                        notationToPutdown.push( variableIndex )
                        return concept.typeSequence[variableIndex]
                    } else {
                        return piece
                    }
                }
            )
        }
        // putdownToNotation is the inverse permutation of notationToPutdown:
        const putdownToNotation = notationToPutdown.slice()
        notationToPutdown.forEach( ( pi, ni ) => putdownToNotation[pi] = ni )
        // Register every literal piece of the notation as a token type:
        notation.forEach( piece => {
            if ( piece instanceof RegExp ) this.addTokenType( piece )
        } )
        // add notation to grammar (which can modify the array in-place, so
        // it is necessary to do this only after using it to make tokens, above)
        let parentType = concept.parentType
        // If this notation is write-only, add it to the derived notations list:
        if ( options.writeOnly ) {
            this.derivedNotation.set( conceptName, notation )
            // record in that notation array several data needed for rendering
            notation.notationToPutdown = notationToPutdown
            notation.putdownToNotation = putdownToNotation
            notation.notation = originalNotation
            notation.variables = options.variables
        // But if it is NOT write-only, add it to the parser:
        } else {
            // putdown is a special language in which every notation counts as a
            // grouper, so everything has high precedence (lowest subtype)
            if ( this.name == 'putdown' )
                parentType = SyntacticTypes.lowestSubtype( parentType )
            this.grammar.addRule( parentType, conceptName )
            this.grammar.addRule( conceptName, notation )
            const rhss = this.rulesFor( conceptName )
            const newRule = rhss[rhss.length - 1]
            // record in the new rule RHS several data about how the rule was made
            newRule.notationToPutdown = notationToPutdown
            newRule.putdownToNotation = putdownToNotation
            newRule.notation = originalNotation
            newRule.variables = options.variables
        }
    }
    /**
     * Treat the given text as an expression in this language and attempt to
     * parse it. Return an abstract syntax tree ({@link AST}) on success, or
     * `undefined` on failure. Or, if you set the optional second parameter to
     * true, it will return an array of all possible parsings, each as an
     * {@link AST}.
     *
     * @param {String} text - the input text to parse
     * @param {boolean} ambiguous - if true, return all possible meanings of the
     *   given text, which will be more than one if the text is ambiguous;
     *   defaults to false, which returns just one AST or undefined
     * @returns {AST|AST[]} - if `ambiguous` is set to false, returns the parsed
     *   AST, or undefined if parsing failed; if `ambiguous` is set to true,
     *   returns all parsed ASTs as an array, which may be empty
     * @see {@link AST#compact compact()}
     */
    parse ( text, ambiguous = false ) {
        // Ensure tokenization succeeds, or throw an appropriate error
        const tokens = this.tokenizer.tokenize( text )
        if ( !tokens ) {
            // Tokenization failure: optionally log diagnostics, then return
            // undefined (implicitly):
            if ( this._debug ) {
                console.log( `Tokenizer failed to tokenize "${text}"` )
                console.log( this.tokenizer.tokenTypes )
            }
            return
        }
        // Find all possible parsings and sort them alphabetically
        // (JSON-serialized form), so results are deterministic:
        const all = this.grammar.parse( tokens, {
            showDebuggingOutput : this._debug
        } )
        all.sort( ( a, b ) =>
            JSON.stringify( a ).localeCompare( JSON.stringify( b ) ) )
        // If they asked for multiple results, return an array (but filter out
        // the ones that don't associate correctly).
        if ( ambiguous )
            return all.map( json => AST.fromJSON( this, json ) )
                      .filter( ast => ast.associatesCorrectly() )
        // They want just one result, so find the first AST that associates
        // correctly and return it.
        for ( let i = 0 ; i < all.length ; i++ ) {
            const ast = AST.fromJSON( this, all[i] )
            if ( ast.associatesCorrectly() ) return ast
        }
    }
    /**
     * Convert text in this language to text in another language. If the text
     * cannot be parsed in this language, then undefined is returned instead.
     * Note that this object and `language` must have the same {@link Converter}
     * instance associated with them, or this function will throw an error.
     *
     * @param {String} text - the text in this language to be converter to the
     *   other language
     * @param {Language} language - the destination language
     * @param {boolean} ambiguous - passed to the {@link Converter#parse
     *   parse()} function, and thus determines whether the result of this is a
     *   string or an array thereof
     * @returns {String|String[]} the converted text, if the conversion was
     *   possible, and undefined otherwise (or an array of strings if
     *   `ambiguous` is true)
     */
    convertTo ( text, language, ambiguous = false ) {
        if ( this.converter != language.converter )
            throw new Error( 'The two languages do not share a Converter' )
        // return one result:
        if ( !ambiguous )
            return this.parse( text )?.toLanguage( language )
        // return multiple results, with duplicates removed, order preserved:
        const all = this.parse( text, true )
                        .map( ast => ast.toLanguage( language ) )
        return all.filter( ( item, index ) =>
            !all.slice( 0, index ).includes( item ) )
    }
    /**
     * Get all grammar rules for the given concept or syntactic type. The
     * result is an array of the right-hand sides of the grammar rules for the
     * concept or syntactic type. Each such right-hand side is the array of
     * tokens or type names used internally by the parser.
     *
     * @param {String|AST} target - if this is a string, it must be the name of
     *   the concept or syntactic type to look up; if it is a leaf AST, then its
     *   contents as a string are used; if it is a compound AST, then its head
     *   is used
     * @returns {Array} an array of the right-hand sides of the grammar rules
     */
    rulesFor ( type ) {
        // Accept an AST in place of a type name:
        if ( type instanceof AST )
            type = type.isCompound() ? type.head().contents : type.contents
        // NOTE(review): when grammar.rules[type] exists, `result` is the live
        // rules array (no copy is made), so the push below mutates parser
        // state, and repeated calls would append duplicates. This appears to
        // rely on write-only concepts never also having grammar rules (see
        // addNotation()); callers also rely on getting the live array so they
        // can tag rule objects — confirm before changing.
        const result = this.grammar.rules[type] || [ ]
        if ( this.derivedNotation.has( type ) )
            result.push( this.derivedNotation.get( type ) )
        return result
    }
    /**
     * This static member of the class contains regular expressions for some
     * common types of notation. The following regular expressions are
     * available to make it easier to define new concepts or notations.
     *
     *  * `oneLetterVariable` - a single letter variable expressed in Roman
     *    letters (lower-case or upper-case A, B, C, ...)
     *  * `nonnegativeInteger` - an integer expressed using just the digits 0-9
     *  * `integer` - same as the previous, but possibly preceded by a `-`
     *  * `nonnegativeNumber` - a number expressed using the digits 0-9 and a
     *    decimal point (optional)
     *  * `number` - same as the previous, but possibly preceded by a `-`
     */
    static regularExpressions = {
        oneLetterVariable : /[a-zA-Z]/, // can be upgraded later with Greek, etc.
        nonnegativeInteger : /\d+/,
        integer : /-?\d+/,
        nonnegativeNumber : /\.[0-9]+|[0-9]+\.?[0-9]*/,
        number : /-?\.[0-9]+|[0-9]+\.?[0-9]*/
    }
    /**
     * Rather than creating an empty language using this class's constructor,
     * then adding each notation with a separate function call, you can
     * construct an instance and add all the notations in one function call with
     * this method.
     *
     * The format for the JSON data structure passed as the third argument is as
     * follows.
     *
     *  * It should have a `"groupers"` field that is an array of strings
     *    containing the exact same data you would pass as the `groupers`
     *    argument to the constructor.
     *  * It should have a `"notations"` field that is an array of objects,
     *    each object having the fields `"concept"`, `"notation"`, and
     *    `"options"`, which correspond directly to the three parameters of the
     *    {@link Language#addNotation addNotation()} function.
     *
     * To see an example of such a data structure, examine the contents of the
     * file {@link latex-notation.js} in this repository.
     *
     * @param {string} name - the name of the language, just as in this class's
     *   constructor
     * @param {Converter} converter - a {@link Converter} instance, just as in
     *   this class's constructor
     * @param {Object} json - the JSON representation of the language, as
     *   described above
     * @param {boolean} addConceptsAlso - if true, before constructing the
     *   Language, examine all built-in concepts mentioned in any of its
     *   notations, and add them to the `converter` using
     *   {@link Converter#addBuiltIns addBuiltIns()}.
     */
    static fromJSON ( name, converter, json, addConceptsAlso = true ) {
        if ( addConceptsAlso ) {
            // Add all concepts explicitly mentioned in the language
            // (deduplicated via a Set):
            const conceptNames = json.notations.filter( entry => entry.concept )
                                               .map( entry => entry.concept )
            converter.addBuiltIns( [ ...new Set( conceptNames ) ] )
            // But every language also contains the concepts defined by regular
            // expressions, so we have to add them too
            const regexpNames = builtInConcepts.filter(
                entry => entry.regularExpression ).map( entry => entry.name )
            converter.addBuiltIns( regexpNames )
        }
        const result = new Language( name, converter, json.groupers )
        json.notations.forEach( entry => result.addNotation(
            entry.concept, entry.notation, entry.options ) )
        return result
    }
    // Internal use only
    // Does this language have a token in its tokenizer that is equal to the
    // given regular expression?  (The tokenizer stores each token's regexp
    // anchored and wrapped as ^(?:...), so we compare against that form.)
    hasTokenType ( regexp ) {
        return this.tokenizer.tokenTypes.find( tokenType =>
            tokenType.regexp.source == `^(?:${regexp.source})` )
    }
    // Internal use only
    // Does this language have a token in its tokenizer that is a regular
    // expression matching the given string?  (Note that we add a $ to test for
    // a full-string match.)
    hasTokenMatching ( string ) {
        return this.tokenizer.tokenTypes.find( tokenType =>
            new RegExp( tokenType.regexp.source + '$' ).test( string ) ) 
    }
    // Internal use only
    // Add a given regular expression to our tokenizer, iff such a token is not
    // already present, and then sort the list of tokens in a way that
    // prioritizes built-in regular expressions lower (because they are infinite
    // sets) and that prioritizes other regular expressions by reverse
    // alphabetical order (so that subwords can't be prioritized over full words
    // that contain them).
    // Note that if the regexp has a property called "originalString", which
    // will have been put there by notationStringToArray(), then the regexp was
    // designed to match exactly one string.  In that case, if any existing
    // token matches that same string, then the new token will not be added.
    // This prevents bugs like the following: If you define function inverse to
    // be f^{-1}, then the "1" doesn't become its own token, borking ordinary
    // decimal numbers like 1.5.  We make (in notationStringToArray()) an
    // exception for this for one-regexp notations, which clearly intend for
    // themselves to be tokenized as-is.
    addTokenType ( regexp ) {
        if ( this.hasTokenType( regexp ) ) return
        if ( regexp.hasOwnProperty( 'originalString' )
          && this.hasTokenMatching( regexp.originalString ) ) return
        this.tokenizer.addType( regexp )
        // Built-in regexps (in their tokenizer-wrapped ^(?:...) form) must
        // sort after all other tokens:
        const lowPriority = Object.keys( Language.regularExpressions )
            .map( key => `^(?:${Language.regularExpressions[key].source})` )
        const shouldBeLater = re => lowPriority.includes( re.source ) ? 1 : 0
        // Recursive character-by-character comparison of two regexp sources;
        // an empty string (i.e., a prefix of the other) sorts LAST, so that
        // longer tokens take priority over their own prefixes:
        const tokenSort = ( a, b ) => a == '' && b == '' ? 0 :
                          a.length == 0 ? 1 : // sort Xsuffix < X
                          b.length == 0 ? -1 : // sort Xsuffix < X
                          a[0] < b[0] ? -1 :
                          a[0] > b[0] ? 1 :
                          tokenSort( a.substring( 1 ), b.substring( 1 ) )
        this.tokenizer.tokenTypes.sort( ( a, b ) => {
            // Built-ins last, then compare unwrapped sources:
            const aLater = shouldBeLater( a.regexp )
            const bLater = shouldBeLater( b.regexp )
            if ( aLater != bLater ) return aLater - bLater
            a = a.regexp.source
            b = b.regexp.source
            a = a.substring( 4, a.length - 1 ) // turn ^(?:foo) into foo
            b = b.substring( 4, b.length - 1 ) // same
            return tokenSort( a, b )
        } )
    }
}