src/tt.input.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405

// References in source input
// ============================================================================

// The references found in the source input are stored as an array of strings
// in the global refs variable: -> declarations

char **refs; /* references: identifier strings found in the source input */
int refs_c; /* count: number of entries of refs actually in use */
int refs_s; /* size (number of elements allocated for) */

// It is allocated at the beginning of the execution to contain an array of
// ten strings. The refs_s variable keeps track of the amount of allocated
// space, while refs_c holds the number of actual elements: -> main.globals

  refs_c = 0;
  refs_s = 10;
  refs = malloc(refs_s * sizeof(char *));
  if (refs == NULL) err(1, "malloc");


// Insertions in source input
// ============================================================================

// tt represents every insertion as an array of strings, where each string
// corresponds to a line to be inserted. All insertions are stored in the
// global ins array: -> declarations

char ***ins; /* insertions: ins[p] holds the lines for refs[p], or NULL */

// The position of each insertion in the ins array is always equal to the
// position of the corresponding reference in the refs array -- to find what
// lines should be inserted at destination X, one must find the value P such
// that refs[P] is equal to X. Then, the corresponding insertion will be equal
// to ins[P].

// In other words, the ins array should always be of the same length as refs.
// As such, the refs_s and refs_c variables are used for ins as well. The ins
// array is allocated to hold the same number of elements as refs. Furthermore,
// its elements are set to NULL, signifying the absence of any insertion at
// that index: -> main.globals

  ins = malloc(refs_s * sizeof(char **));
  if (ins == NULL) err(1, "malloc");
  for (i = 0; i < refs_s; i++)
    ins[i] = NULL;

// -> main.declarations

int i; /* index used by the loop that clears the ins array */


// Parsing standard input
// ============================================================================

// Text is read from the standard input, line by line, into a line variable.
// Two additional variables, line_s and line_l, keep track of the amount of
// allocated space and the actual number of characters in the string,
// respectively: -> main.declarations

  char *line; /* buffer holding the input line currently being read */
  int line_l; /* length: number of characters stored in line so far */
  int line_s; /* size (number of characters allocated for) */

// It initially is allocated to hold 100 characters: -> main.input

  line_s = 100;
  line_l = 0;
  /* one byte more than line_s, leaving room for the string terminator */
  line = malloc((size_t)line_s + 1);
  if (!line) err(1, "malloc");

// Lines are read character by character until end of file. First, the read
// character is assigned to the variable b. When it is certain that it is not
// EOF, then it is assigned to the variable c: -> main.input

  /* read into b first; it is copied to c only once it is known not to be
     EOF, so c always holds the last real character read */
  while ((b = getchar()) != EOF) {  
    c = b;

// -> main.declarations

  int b; /* raw return value of getchar(), compared against EOF */
  int c = '\n'; /* last character read; starts out as a newline so that the
                   check after the read loop does not read an uninitialised
                   value (and does not fake a final line) on empty input */

// First of all, tt ignores any carriage returns: -> main.input

  if (c == '\r') {
    continue; /* carriage returns never reach the line buffer */
  }

// On Windows, where carriage returns are used, they will automatically be
// removed anyway. On systems that don't use carriage returns, they might not
// be stripped from the input, which is why tt ignores them.

// Otherwise, on every iteration, tt checks whether the read character is a
// newline. If not, the character is added to the line variable, which is 
// re-allocated if necessary. The line_l, keeping track of the line's length,
// is incremented as well: -> main.input

    if (c != '\n') {
      /* every slot is taken: grow the buffer by another 20 characters */
      if (line_l >= line_s) {
        line_s = line_s + 20;
        tmp = realloc(line, (size_t)line_s + 1);
        if (!tmp) err(1, "malloc");
        line = tmp;
      }
      line[line_l] = c;
      line_l++;
      continue;
    }

// The tmp variable used in the re-allocation has a type which is identical to
// that of the line variable: -> main.declarations

  char *tmp; /* receives realloc's result before it replaces line */

// If the read character is a newline, then the program "finishes" the line,
// adding a terminating NUL character and resetting line_l: -> main.input

finish: /* also the target of a goto issued after the read loop */
    line[line_l] = '\0'; /* terminate the characters collected so far */
    line_l = 0; /* the next line starts filling the buffer from index 0 */

// Before parsing the line, we make sure to skip it if it is empty and
// following a non-code line: -> main.input

    /* no code prefix, previous line was not code, and this line is empty */
    if (code_prefix[0] == '\0' && !wascode && line[0] == '\0')
      continue;

// This is only desirable if CODE_PREFIX is empty, because then, there is no
// way for the writer of the source input to, for appearance's sake, leave an
// empty line between non-code lines and code lines; any empty line will
// inevitably be interpreted as a code line. The code above circumvents
// this.

// This aesthetical nicety requires the program to keep track of whether the
// previous line was a code line or not: -> main.declarations

  bool wascode = false; /* whether the previously processed line was code */

// Now, it is time to check whether the read line is a code line (an insertion)
// or a documentation line (containing a reference): -> main.input
    
    if (!insertion(line)) reference(line);
  }

// The insertion and reference functions modify the ins and refs variables
// according to the contents of the line.

// Finally, after the loop is finished -- meaning that EOF has been reached --
// we must check whether the final character was a newline; if it was not, the
// final line of source input has not been processed, as lines are processed
// only when the terminating newline is encountered.

// Thus, if the final character was not a newline, tt goes back and finishes
// the final line: -> main.input

  /* c still holds the last character read. If it was not a newline, the last
     line is still pending: jump back into the loop body to process it. c is
     set to '\n' first, so this test fails the next time it is reached. */
  if (c != '\n') { c = '\n'; goto finish; }


// Identifying and processing documentation lines containing references
// ============================================================================

// The reference function is responsible for processing references in source
// input lines: -> declarations

/* Examine line and, if it is a documentation line that ends in a reference,
   make that reference the current one and record it in refs if it is new. */
void reference(char *line);


// Parsing the line
// ----------------------------------------------------------------------------

// Documentation lines are formatted as follows: ->

documentation line ::= DOC_PREFIX anything [reference]

reference ::= "->" [whitespace] identifier [whitespace]
identifier ::= not whitespace

// In order to identify whether a given line actually is a documentation line
// containing a reference, the line variable is aliased to ln, which will be
// modified instead of line: -> reference.declarations

  char *ln = line; /* cursor advanced through the line while parsing */

// First, we ensure the line begins with the doc_prefix: -> reference.parse

  /* only lines that start with the documentation prefix can hold a reference */
  if (strncmp(ln, doc_prefix, strlen(doc_prefix))) {
    return;
  }

// Then, we ensure that a hyphen is present: -> reference.parse

hyphen: /* scan forward for the next '-' */
  if (*ln == '\0') return; /* end of line reached without finding a reference */
  else if (*ln == '-') { ln++; goto lessthan; }
  else { ln++; goto hyphen; }

// After finding the hyphen, we check whether a greater-than sign follows it.
// If not, we keep looking for another hyphen. -> reference.parse

lessthan: /* entered with ln pointing just past a '-' */
  if (*ln != '>') goto hyphen; /* not an arrow: resume the search from here */
  else ln++; /* step past the '>' */

// After finding a greater-than sign following a hyphen (->), we ignore all
// whitespace, if there is any. If the end of the line has been reached, or is
// reached, by this point, then it will be interpreted as an empty reference,
// resetting the current reference (meaning that subsequent code lines will not
// be attached to any reference): -> reference.parse

space: /* skip any whitespace that follows the arrow */
  /* isspace requires a value representable as unsigned char */
  if (isspace((unsigned char)*ln)) { ln++; goto space; }
  /* An empty reference clears the current one. ref is a heap buffer that is
     written with sprintf and eventually freed, so it is emptied in place
     rather than being repointed at a string literal. */
  if (*ln == '\0') { ref[0] = '\0'; return; }

// Now, a valid reference should be a string of non-space characters,
// followed optionally by whitespace, but not anything other than whitespace:
// -> reference.parse

  for (i = 0; i < strlen(ln); i++)
    if (isspace(ln[i])) {
      for (j = i; j < strlen(ln); j++)
        if (!isspace(ln[j])) return;
      break;
    }

// -> reference.declarations

  int i; /* index of the first whitespace after the identifier, or its end */
  int j; /* index used to check that only whitespace follows the identifier */

// After the loop above, i will be set to the index of the first encountered
// space or the end of the line. Any trailing whitespace should be ignored:
// -> reference.parse

  ln[i] = '\0'; /* cut the line off right after the identifier */


// Adding the reference
// ----------------------------------------------------------------------------

// At this point, we have found a valid reference, which should now be added to
// the global refs array.

// First, however, it should be mentioned that reference identifiers have a
// maximum length of 80 characters: -> definitions

/* maximum number of characters kept from a reference identifier */
#define REFMAX 80

// Thus, any reference identifier longer than REFMAX is truncated, with a
// warning printed to the standard error stream: -> reference.add
  
  /* warn about over-long identifiers and keep only the first REFMAX characters */
  if (strlen(ln) > REFMAX) {
    fprintf(stderr,
      "Warning: Truncating identifier exceeding %d characters\n", REFMAX);
    *(ln + REFMAX) = '\0';
  }

// It should also be mentioned that the current reference is always stored in a
// global variable, from which the insertion(char *) function knows with which
// reference to associate each code line: -> declarations

char *ref; /* current reference; an empty string means there is none */

// It is allocated in the beginning of the program's execution: -> main.globals

  ref = malloc(1 + REFMAX * sizeof(char));
  if (ref == NULL) err(1, "malloc");

// It is freed before the output section of the program, at which point it is
// no longer needed: -> main.output

  free(ref); /* the current reference is only needed while reading input */

// The variable is set by our reference function: -> reference.add

  sprintf(ref, "%s", ln); /* set current reference */
  ref[strlen(ln)] = '\0';

// Now remains the work of adding the reference to the global refs variable --
// unless it already exists in refs: -> reference.add

  for (i = 0; i < refs_c; i++)
    if (strcmp(refs[i], ref) == 0) return;

// If the reference truly is new, we notify the user: -> reference.add

  /* reported on standard error */
  fprintf(stderr, "New reference: %s\n", ref);

// Before adding the new reference to refs, we re-allocate refs (and therefore
// also ins, which should always be as large as refs), if needed:
// -> reference.add

  /* Count the new reference; if it no longer fits, grow refs and ins together
     by ten slots so that both arrays keep the same length. */
  if (++refs_c > refs_s) {
    refs_s += 10;
    tmp = realloc(refs, refs_s * sizeof *tmp);
    if (tmp == NULL) err(1, "malloc");
    refs = tmp;
    /* sized by ins's own element type rather than by char * */
    tmp2 = realloc(ins, refs_s * sizeof *tmp2);
    if (tmp2 == NULL) err(1, "malloc");
    ins = tmp2;
    /* realloc leaves the added slots uninitialised; only the ten new ones,
       from the old size up to the new size, need to be cleared */
    for (i = refs_s - 10; i < refs_s; i++)
      ins[i] = NULL;
  }

// -> reference.declarations

  char **tmp; /* receives realloc's result before it replaces refs */
  char ***tmp2; /* receives realloc's result before it replaces ins */

// Notice that the code above also increases the refs_c count. Now, everything
// else is done, and the reference is ready to be added: -> reference.add

  refs[refs_c-1] = malloc(1 + REFMAX * sizeof(char));
  sprintf(refs[refs_c-1], "%s", ref);


// Identifying and processing code lines
// ============================================================================

// The insertion function is responsible for processing code lines:
// -> declarations

/* Process line as a code line for the current reference. Returns true if the
   line was a code line (an insertion), false otherwise. */
bool insertion(char *line);

// It returns true if the given line is a code line (i.e., an insertion).


// Parsing the code line
// ----------------------------------------------------------------------------

// First of all, if there is no current reference, the insertion should be
// ignored: -> insertion.parse

  /* an empty current reference means there is nowhere to insert the line */
  if (*ref == '\0') {
    return false;
  }

// If there is a CODE_PREFIX, we ensure that the line begins with it.
// Likewise, if there is a DOC_PREFIX, we ensure that the line does not
// begin with it: -> insertion.parse

  /* a non-empty code prefix must be present at the start of the line */
  if (code_prefix[0] != '\0' &&
      strncmp(line, code_prefix, strlen(code_prefix)) != 0)
    return false;
  /* a non-empty documentation prefix must not be */
  if (doc_prefix[0] != '\0' &&
      strncmp(line, doc_prefix, strlen(doc_prefix)) == 0)
    return false;

// As you can see, the DOC_PREFIX is given precedence over the CODE_PREFIX.


// Adding the code line to the insertions
// ----------------------------------------------------------------------------

// Now that we know the line contains an insertion, we must find the index
// of the current reference in the refs array: -> insertion.add

  for (i = 0; i < refs_c; i++)
    if (strcmp(refs[i], ref) == 0) break;

// -> insertion.declarations

  int i; /* position of the current reference in refs, and thus in ins */

// Our goal is to add the insertion to the corresponding position in the ins
// array. If there is no insertion at that position, the value will be NULL:
// -> insertion.add

  if (ins[i] == NULL) {
    /* Two pointer slots are written below: the new line at index 0 and the
       terminating NULL entry at index 1. */
    ins[i] = malloc(2 * sizeof *ins[i]);
    if (ins[i] == NULL) err(1, "malloc");
    len = 0;
  }

// If ins[i] is not NULL, then it already contains some number of insertion
// strings, terminated by a final NULL value. In order to allocate memory
// for the new insertion, we find the position of the final NULL value,
// corresponding to the length of the ins[i] array: -> insertion.add

  else {
    /* count the lines already stored, up to the terminating NULL entry */
    for (len = 0; ins[i][len] != NULL; len++) ;
    /* len existing lines, the new line, and the terminating NULL entry */
    tmp = realloc(ins[i], (len + 2) * sizeof *tmp);
    if (tmp == NULL) err(1, "malloc");
    ins[i] = tmp;
  }

// -> insertion.declarations

  char **tmp; /* receives realloc's result before it replaces ins[i] */
  int len; /* number of lines already stored in ins[i] */

// Now remains adding the insertion to ins[i]. First, we mark the new final
// position: -> insertion.add

  ins[i][len + 1] = NULL; /* new end marker, one past the line being added */

// Then, we allocate memory for the string: -> insertion.add

  /* sized for the whole line plus its terminator */
  ins[i][len] = malloc(strlen(line) + 1);
  if (!ins[i][len]) err(1, "malloc");

// Finally, we copy the string, returning true, signifying that the line
// processed indeed was a code line: -> insertion.add

  /* store everything after the code prefix, terminator included */
  strcpy(ins[i][len], line + strlen(code_prefix));
  return true;

// Notice also that we make sure to skip the CODE_PREFIX.