A set of C++ classes to implement a Reverse Descent Parser.

modified:10 September 2024 04:20:41.

Purpose:

This is a series of C++ classes to implement a Reverse Descent Parser.
It is meant as test code, therefore it is quite verbose in what it prints.

The List class is a double linked list to make handling both of the Token list and the RDP list easier.

There are 3 stages to parsing the input:

  1. Parse the formula into tokens
  2. Convert the token list into a Reverse Polish Notation list
  3. Traverse the Reverse Polish Notation list to get the result

NOTE: This code is incomplete. I have removed the debug code for understandability. For the complete code see this archive.

Parsing the formula into tokens:

The parsing is done with a Finite State Machine. It takes the formula string and extracts the individual tokens (e.g. '+') onto a list.

The states are all created as static pointers.

This is the struct for all the nodes on the Token list and the RPN list.
Each node can be just a type, or have ether a double precision float, 64bit integer, or a string array associated with it.

typedef struct {
	struct Node node;		// a node for the double linked list.
	int type;
	union {
		double number;
		long long integer;
		char *str;
	};
} t_node;

The virtual base class

class StateStart : public StateBase {
	protected:
		string name = "Start";
	public:
		void enter (Tokenize *fsm) override {};
		void work (Tokenize *fsm, const char *str) override;
		void exit (Tokenize *fsm) override {};
};

This is the class definition for the FSM. On a state change it calls the exit() function on the previous state, then changes the state and calls the enter() on the new class.

class Tokenize {
	private:
		string name = "Tokenize";		// for debug
		void reset (void) { flush_list(); formula = ""; used = 0; } 
		void flush_list (void) {
			t_node *node;
			while (node = (t_node *)list->rem_head()) {
				free (node);
			}
		}
	protected:
		StateBase *state;
	public:
		List *list;
		string formula;
		int used = 0;
		Tokenize(List *list) { 
			create_all_states(); 
			this->list = list; 
			this->state = state_start; 
		}
		~Tokenize() { }
		void set_formula (string str) { reset(); formula = str; }
		void set_state (StateBase *n) { 
			if (state) state->exit(this);
			state = n;
			if (state) state->enter(this);
		}
		bool do_work (void) { 
			while (formula.length() > 0) {
				state->work(this, &formula.c_str()[used]);
			} 
			return formula.length() == 0;
		};
};

For each state it is subclassed from the StateBase virtual class. For most states the work and the exit state functions are overrided.

For this implementation there are

Convert the token list into a RPN list:

This takes the nodes on the token list, and re-orders those tokens onto the RPN list so they can be evaluated easier.
Using a recursive function chain to work out what to do.

The Reverse Descent Parser class.

class Parser {
	private:
	t_node *level_add_plus (t_node *);
	t_node *level_mult_div (t_node *);
	t_node *level_bracket (t_node *);
	t_node *level_push (t_node *);
	protected:
	public:
	List *token_list;
	List *rdp_list;
	Parser(List *token_list, List *rdp_list) { this->token_list = token_list; this->rdp_list = rdp_list; }
	Parser() {}
	~Parser() {}
	int parse (void);
	void set_token_list (List *list) { token_list = list; } ;
	void set_rdp_list (List *list) { rdp_list = list; } ;
};

The main parse function, this sets up for the parser chain.

int Parser::parse (void) {
	t_node *node = (t_node *) token_list->rem_head();
	if (node) {
		spaces = "";
		node = LEVEL_START (node);
	}
	return 0;			// 0 = success
}

This is the first in the function chain. It is the lowest in the priority in the order of precedence, and calls higher level_mult_div().

t_node *Parser::level_add_plus (t_node *node) {
	node = level_mult_div (node);
	while (node && (node->type == NT_ADD || node->type == NT_SUB)) {
		t_node *current = node;
		node = (t_node *) token_list->rem_head();
		node = level_mult_div (node);
		rdp_list->add_head (¤t->node);
	}
	return node;
}

This calls the next higher function level_bracket().

t_node *Parser::level_mult_div (t_node *node) {
	node = level_bracket (node);
	while (node && (node->type == NT_MULT || node->type == NT_DIV)) {
		t_node *current = node;
		node = (t_node *) token_list->rem_head();
		node = level_bracket (node);
		rdp_list->add_head (¤t->node);
	}
	spaces = spaces.substr(2);
	return node;
}

This calls the next higher function level_push().

t_node *Parser::level_bracket (t_node *node) {
	if (node && (node->type == NT_OPEN)) {
		free (node);												// not interested in the OPEN token
		node = LEVEL_START ((t_node *) token_list->rem_head());			// start at the top of the chain again with the next token
		if (node->type == NT_CLOSE) {
			free (node);
		}
		node = (t_node *)token_list->rem_head ();						// get the next token
	}
	else node = level_push(node);
	return node;
}

This is the last in the chain and puts the token on list.

t_node *Parser::level_push (t_node *node) {		/* bottom of the chain */
	rdp_list->add_head (&node->node);
	node = (t_node *) token_list->rem_head();
	return node;
}

Traverse the RPN list to get the answer:

This takes the RPN list and applies the mathimatical functions to work out the answer.

The class for RPN parser.

class Evaluate {
	private:
	List *list;
	t_node * create_number_node (double number);
	t_node * create_integer_node (long long integer);
	protected:
	public:
	Evaluate(List *list) { this->list = list; }
	t_node * evaluate (void);
};

This is a recursive functon to work though the RPN list evaluating the nodes as the are removed from the head of the list.
It removes a node from the list, and according to its type, it does the appropriate function.

t_node * Evaluate::evaluate (void) {
	t_node *node = (t_node *) list->rem_head();
	switch (node->type) {
		case NT_ADD: {
			t_node *node2 = evaluate();
			t_node *node1 = evaluate();
			t_node *result;
			if (node1->type == NT_INT && node2->type == NT_INT) {
				result = create_integer_node (node1->integer + node2->integer);
			}
			else {
				result = create_number_node (to_dbl(node1) + to_dbl(node2));
			}
			free (node1);
			free (node2);
			free (node);
			node = result;
			break;
		}
		case NT_SUB: {
			t_node *node2 = evaluate();
			t_node *node1 = evaluate();
			t_node *result;
			if (node1->type == NT_INT && node2->type == NT_INT) {
				 result = create_integer_node (node1->integer - node2->integer);
			}
			else {
				result = create_number_node (to_dbl(node1) - to_dbl(node2));
			}
			free (node1);
			free (node2);
			free (node);
			node = result;
			break;
		}
		case NT_MULT: {
			t_node *node2 = evaluate();
			t_node *node1 = evaluate();
			t_node *result;
			if (node1->type == NT_INT && node2->type == NT_INT) {
				 result = create_integer_node (node1->integer * node2->integer);
			}
			else {
				result = create_number_node (to_dbl(node1) - to_dbl(node2));
			}
			free (node1);
			free (node2);
			free (node);
			node = result;
			break;
		}
		case NT_DIV: {
			t_node *node2 = evaluate();
			t_node *node1 = evaluate();
			t_node *result;
			if (node1->type == NT_INT && node2->type == NT_INT) {
				 result = create_integer_node (node1->integer / node2->integer);
			}
			else {
				result = create_number_node (to_dbl(node1) / to_dbl(node2));
			}
			free (node1);
			free (node2);
			free (node);
			node = result;
			break;
		}
	}
	return node;
}

Creates an integer node that is returned by one call to the evaluate function.

t_node *Evaluate::create_integer_node (long long integer) {
	t_node *node = (t_node *) malloc (sizeof(t_node));
	node->type = NT_INT;
	node->integer = integer;
	return node;
}

Creates a floating point node that is returned by one call to the evaluate function.

t_node *Evaluate::create_number_node (double number) {
	t_node *node = (t_node *) malloc (sizeof(t_node));
	node->type = NT_FLOAT;
	node->number = number;
	return node;
}