Initial revision
This commit is contained in:
284
doc/ceg/proposal.tr
Normal file
284
doc/ceg/proposal.tr
Normal file
@@ -0,0 +1,284 @@
|
||||
.TL
|
||||
|
||||
Code Expander
|
||||
.br
|
||||
(proposal)
|
||||
|
||||
.SH
|
||||
Introduction
|
||||
.LP
|
||||
The \fBcode expander\fR, \fBce\fR, is a program that translates EM-code to
|
||||
objectcode. The main goal is to translate very fast. \fBce\fR is an instance
|
||||
of the EM_CODE(3L)-interface. During execution of \fBce\fR, \fBce\fR will build
|
||||
in core a machine independent objectfile ( NEW A.OUT(5L)). With \fBcv\fR or
|
||||
with routines supplied by the user the machine independent objectcode will
|
||||
be converted to a machine dependent object code. \fBce\fR needs
|
||||
information about the targetmachine (e.g. the opcodes). We divide the
|
||||
information into two parts:
|
||||
.IP
|
||||
- The description in assembly instructions of EM-code instructions.
|
||||
.IP
|
||||
- The description in objectcode of assembly instructions.
|
||||
.LP
|
||||
With these two tables we can make a \fBcode expander generator\fR which
|
||||
generates a \fBce\fR. It is possible to put the information in one table
|
||||
but that will probably introduce more bugs in the table. So we
|
||||
divide and conquer. With this approach it is also possible to generate
|
||||
assembly code (rather than objectcode), which is useful for debugging.
|
||||
There is of course a link between the two tables, the link
|
||||
consists of a restriction on the assembly format. Every assembly
|
||||
instruction must have the following format:
|
||||
.sp
|
||||
INSTR ::= [ LABEL ":" ] MNEMONIC [ OPERAND ( "," OPERAND)* ]
|
||||
.sp
|
||||
.LP
|
||||
\fBCeg\fR uses the following algorithm:
|
||||
.IP \0\0a)
|
||||
The assembly table will be converted to a (C-)routine assemble().
|
||||
assemble() gets as argument a string, the assembler instruction,
|
||||
and can use the MNEMONIC to execute the corresponding action in the
|
||||
assembly table.
|
||||
.IP \0\0b)
|
||||
The routine assemble() can now be used to convert the EM-code table to
|
||||
a set of C-routines, which together form an instance of the
|
||||
EM_CODE(3L).
|
||||
.SH
|
||||
The EM-instruction table
|
||||
.LP
|
||||
We use the following grammar:
|
||||
.sp
|
||||
.TS
|
||||
center box ;
|
||||
l.
|
||||
TABLE ::= (ROW)*
|
||||
ROW ::= C_instr ( SPECIAL | SIMPLE)
|
||||
SPECIAL ::= ( CONDITION SIMPLE)+ 'default' SIMPLE
|
||||
SIMPLE ::= '==>' ACTIONLIST | '::=' ACTIONLIST
|
||||
ACTIONLIST ::= [ ACTION ( ';' ACTION)* ] '.'
|
||||
ACTION ::= function-call | assembly-instruction
|
||||
.TE
|
||||
.LP
|
||||
An example for the 8086:
|
||||
.LP
|
||||
.DS
|
||||
C_lxl
|
||||
$arg1 == 0 ==> "push bp".
|
||||
$arg1 == 1 ==> "push EM_BSIZE(bp)".
|
||||
default ==> "mov cx, $arg1";
|
||||
"mov si, bp";
|
||||
"1: mov si, EM_BSIZE(si)";
|
||||
"loop 1b";
|
||||
"push si".
|
||||
.DE
|
||||
.sp
|
||||
Some remarks:
|
||||
.sp
|
||||
* The C_instr is a function identifier in the EM_CODE(3L)-interface.
|
||||
.LP
|
||||
* CONDITION is a "boolean" C-expression.
|
||||
.LP
|
||||
* The arguments of an EM-instruction can be used in CONDITION and in assembly
|
||||
instructions. They are referred to by $arg\fIi\fR. \fBceg\fR modifies the
|
||||
arguments as follows:
|
||||
.IP \0\0-
|
||||
For local variables at positive offsets it increases this offset by EM_BSIZE.
|
||||
.IP \0\0-
|
||||
It makes names and labels unique. The user must supply the formats (see mach.h).
|
||||
.LP
|
||||
* function-call is allowed to implement e.g. push/pop optimization.
|
||||
For example:
|
||||
.LP
|
||||
.DS
|
||||
C_adi
|
||||
$arg1 == 2 ==> combine( "pop ax");
|
||||
combine( "pop bx");
|
||||
"add ax, bx";
|
||||
save( "push ax").
|
||||
default ==> arg_error( "C_adi", $arg1).
|
||||
.DE
|
||||
.LP
|
||||
* The C-functions called in the EM-instructions table have to use the routine
|
||||
assemble()/gen?(). "assembler-instr" is in fact assemble( "assembler-instr").
|
||||
.LP
|
||||
* \fBceg\fR takes care not only about the conversions of arguments but also
|
||||
about
|
||||
changes between segments. There are situations when one doesn't want
|
||||
conversion of arguments. This can be done by using ::= instead of ==>.
|
||||
This is useful when two C_instr are equivalent. For example:
|
||||
.IP
|
||||
C_slu ::= C_sli( $arg1)
|
||||
.LP
|
||||
* There are EM_CODE instructions which are machine independent (e.g. C_open()).
|
||||
For these EM_CODE instructions \fBceg\fR will generate \fIdefault\fR-
|
||||
instructions. There is one exception: in the case of C_pro() the tablewriter
|
||||
has to supply a function prolog().
|
||||
.LP
|
||||
* Also the EM-pseudoinstructions C_bss_\fIcstp\fR(), C_hol_\fIcstp\fR(),
|
||||
C_con_\fIcstp\fR() and C_rom_\fIcstp\fR() can be translated automatically.
|
||||
\fBceg\fR only has to know how to interpret string-constants:
|
||||
.DS
|
||||
\&..icon $arg2 == 1 ==> gen1( (char) atoi( $arg1))
|
||||
$arg2 == 2 ==> gen2( atoi( $arg1))
|
||||
$arg2 == 4 ==> gen4( atol( $arg1))
|
||||
\&..ucon $arg2 == 1 ==> gen1( (char) atoi( $arg1))
|
||||
$arg2 == 2 ==> gen2( atoi( $arg1))
|
||||
$arg2 == 4 ==> gen4( atol( $arg1))
|
||||
\&..fcon ::= not_implemented( "..fcon")
|
||||
.DE
|
||||
.LP
|
||||
* Still, life can be made easier for the tablewriter; for the routines which
|
||||
he/she didn't implement \fBceg\fR will generate a default instruction which
|
||||
generates an error-message. \fBceg\fR seems to generate :
|
||||
.IP
|
||||
C_xxx ::= not_implemented( "C_xxx")
|
||||
.SH
|
||||
The assembly table
|
||||
.LP
|
||||
How to map assembly on objectcode.
|
||||
.LP
|
||||
Each row in the table consists of two fields, one field for the assembly
|
||||
instruction, the other field for the corresponding objectcode. The tablewriter
|
||||
can use the following primitives to generate code for the machine
|
||||
instructions :
|
||||
.IP "\0\0gen1( b)\0\0:" 17
|
||||
generates one byte in the machine independent objectfile.
|
||||
.IP "\0\0gen2( w)\0\0:" 17
|
||||
generates one word ( = two bytes), the table writer can change the byte
|
||||
order by setting the flag BYTES_REVERSED.
|
||||
.IP "\0\0gen4( l)\0\0:" 17
|
||||
generates two words ( = four bytes), the table writer can change the word
|
||||
order by setting the flag WORDS_REVERSED.
|
||||
.IP "\0\0reloc( n, o, r)\0\0:" 17
|
||||
generates relocation information for a label ( = name + offset +
|
||||
relocationtype).
|
||||
.LP
|
||||
Besides these primitives the table writer may use his/her own
|
||||
C-functions. This allows the table writer e.g. to write functions to set
|
||||
bitfields within a byte.
|
||||
.LP
|
||||
There are more or less two methods to encode the assembly instructions:
|
||||
.IP \0\0a)
|
||||
MNEMONIC and OPERAND('s) are encoded independently of each other. This can be
|
||||
done when the target machine has an orthogonal instruction set (e.g. pdp-11).
|
||||
.IP \0\0b)
|
||||
MNEMONIC and OPERAND('s) together determine the opcode. In this case the
|
||||
assembler often uses overloading: one MNEMONIC is used for several
|
||||
different machine-instructions. For example : (8086)
|
||||
.br
|
||||
mov ax, bx
|
||||
.br
|
||||
mov ax, variable
|
||||
.br
|
||||
These instructions have different opcodes.
|
||||
.LP
|
||||
As the transformation MNEMONIC-OPCODE is not one to
|
||||
one the table writer must be allowed to put restrictions on the operands.
|
||||
This can be done with type declarations. For example:
|
||||
.LP
|
||||
.DS
|
||||
mov dst:REG, src:MEM ==>
|
||||
gen1( 0x8b);
|
||||
modRM( op1.reg, op2);
|
||||
.DE
|
||||
.DS
|
||||
mov dst:REG, src:REG ==>
|
||||
gen1( 0x89);
|
||||
modRM( op2.reg, op1);
|
||||
.DE
|
||||
.LP
|
||||
modRM() is a function written by the tablewriter and is used to encode
|
||||
the operands. This frees the table writer of endless typing.
|
||||
.LP
|
||||
The table writer has to do the "typechecking" by himself. But typechecking
|
||||
is almost the same as operand decoding. So it's more efficient to do this
|
||||
in one function. We now have all the tools to describe the function
|
||||
assemble().
|
||||
.IP
|
||||
assemble() first calls the function
|
||||
decode_operand() (written by the table writer), with two arguments: a
|
||||
string ( the operand) and a
|
||||
pointer to a struct. The struct is declared by the table writer and must
|
||||
consist of at least a field called type. ( the other fields in the struct can
|
||||
be used to remember information about the decoded operand.) Now assemble()
|
||||
fires a row which is selected by mapping the MNEMONIC and the type of the
|
||||
operands.
|
||||
.br
|
||||
In the second field of a row there may be references to other
|
||||
fields in the struct (e.g. op2.reg in the example above).
|
||||
.LP
|
||||
We ignored one problem. It's possible when the operands are encoded, that
|
||||
not everything is known. For example $arg\fIi\fR arguments in the
|
||||
EM-instruction table get their value at runtime. This problem is solved by
|
||||
introducing a function eval(). eval() has a string as argument and returns
|
||||
an arith. The string consists of constants and/or $arg\fIi\fR's and the value
|
||||
returned by eval() is the value of the string. To encode the $arg\fIi\fR's
|
||||
in as few bytes as possible the table writer can use the statements %if,
|
||||
%else and %endif. They can be used in the same manner as #if, #else and
|
||||
#endif in C and result in a runtime test. An example :
|
||||
.LP
|
||||
.DS
|
||||
-- Some rows of the assembly table
|
||||
|
||||
mov dst:REG, src:DATA ==>
|
||||
%if sfit( eval( src), 8) /* does the immediate-data fit in 1 byte? */
|
||||
R53( 0x16 , op1.reg);
|
||||
gen1( eval( src));
|
||||
%else
|
||||
R53( 0x17 , op1.reg);
|
||||
gen2( eval( src));
|
||||
%endif
|
||||
.LD
|
||||
|
||||
mov dst:REG, src:REG ==>
|
||||
gen1( 0x8b);
|
||||
modRM( op1.reg, op2);
|
||||
|
||||
.DE
|
||||
.DS
|
||||
-- The corresponding part in the function assemble() :
|
||||
|
||||
case MNEM_mov :
|
||||
decode_operand( arg1, &op1);
|
||||
decode_operand( arg2, &op2);
|
||||
if ( REG( op1.type) && DATA( op2.type)) {
|
||||
printf( "if ( sfit( %s, 8)) {\\\\n", eval( arg2));
|
||||
R53( 0x16 , op1.reg);
|
||||
printf( "gen1( %s);\\\\n", eval( arg2));
|
||||
printf( "}\\\\nelse {\\\\n");
|
||||
R53( 0x17 , op1.reg);
|
||||
printf( "gen2( %s);\\\\n", eval( arg2));
|
||||
printf( "}\\\\n");
|
||||
}
|
||||
else if ( REG( op1.type) && REG( op2.type)) {
|
||||
gen1( 0x8b);
|
||||
modRM( op1.reg, op2);
|
||||
}
|
||||
|
||||
|
||||
.DE
|
||||
.DS
|
||||
-- Some rows of the right part of the EM-instruction table are translated
|
||||
-- into the following C-functions.
|
||||
|
||||
"mov ax, $arg1" ==>
|
||||
if ( sfit( w, 8)) { /* w is the actual argument of C_xxx( w) */
|
||||
gen1( 176); /* R53() */
|
||||
gen1( w);
|
||||
}
|
||||
else {
|
||||
gen1( 184);
|
||||
gen2( w);
|
||||
}
|
||||
.LD
|
||||
|
||||
"mov ax, bx" ==>
|
||||
gen1( 139);
|
||||
gen1( 99); /* modRM() */
|
||||
.DE
|
||||
.SH
|
||||
Restrictions
|
||||
.LP
|
||||
.IP \0\01)
|
||||
The EM-instruction C_exc() is not implemented.
|
||||
.IP \0\02)
|
||||
All messages are ignored.
|
||||
Reference in New Issue
Block a user