我正在开发一个词法分析器,它读取 Cm 汇编代码并生成标记(token),大部分情况下运行良好。我只有两个小问题。第一个也是最重要的问题:我很难区分机器码和行号。目前的做法是,如果第一个字符是数字,并且长度小于2,我就把它当作行号。问题在于,机器码同样符合这一描述。实际上,行号的长度可以超过2位,而机器码是十六进制的,不一定以数字开头。我想不出能同时满足这些要求的判断条件。第二个问题很小:处理注释时,它会读取除最后一个字符以外的所有字符。输入示例和我的代码见下文。非常感谢大家的帮助!
import java.util.List;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Line-oriented lexer for Cm assembly text.
 *
 * <p>{@link #lex(String)} splits one line of input into {@link Token}s;
 * {@link #main(String[])} is a small driver that lexes a file line by line
 * and prints every token.
 */
public class Lexer
{
    /** File lexed by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_SRC_FILENAME =
        "C:\\Users\\abdcg\\Desktop\\School\\Concordia\\Semester 4\\SOEN 341\\Project B\\Sprint 1\\Lexer test\\Test 2.txt";

    private static File srcFile = null ;
    private static String srcFilename = "<srcFilename>";

    /**
     * Token categories produced by the lexer.
     *
     * <p>NOTE(review): {@code OFFSETS} is declared but never produced by
     * {@link #lex(String)}; the original author was unsure how addresses and
     * offsets should be handled for Cm assembly.
     */
    public static enum Type
    {
        MNUMONIC_NAMES, LABELS, ADDRESSES, OFFSETS, COMMENTS, LINE_NUMBER;
    }

    /** An immutable (type, text) pair produced by the lexer. */
    public static class Token
    {
        /** Category of this token. */
        public final Type t;
        /** Source text of this token. */
        public final String c;

        /**
         * @param t category of the token
         * @param c source text of the token
         */
        public Token(Type t, String c)
        {
            this.t = t;
            this.c = c;
        }

        /**
         * Renders the token as {@code TYPE<text>}, e.g. {@code LABELS<loop>}.
         *
         * @return the enum constant name followed by the text in angle brackets
         */
        @Override
        public String toString()
        {
            return t.name() + "<" + c + ">";
        }
    }

    /**
     * Returns the run of letters and digits that starts at index {@code i}.
     * Used for mnemonics, labels and numbers, which are all single words.
     *
     * @param s the line being lexed
     * @param i index of the first character to examine
     * @return the longest alphanumeric substring starting at {@code i};
     *         empty when the character at {@code i} is not a letter or digit
     */
    public static String getAtom(String s, int i)
    {
        int end = i;
        while (end < s.length() && Character.isLetterOrDigit(s.charAt(end)))
        {
            end++;
        }
        return s.substring(i, end);
    }

    /**
     * Returns the comment that starts at index {@code i} and runs to the end
     * of the line.
     *
     * @param s the line being lexed
     * @param i index of the comment's first character
     * @return everything from {@code i} through the last character of {@code s}
     */
    public static String getComment(String s, int i)
    {
        // substring's end index is exclusive, so the whole tail is substring(i);
        // using length() - 1 as the end index dropped the final character.
        return s.substring(i);
    }

    /**
     * Splits one line of input into tokens.
     *
     * <ul>
     *   <li>{@code ;} starts a comment that extends to the end of the line.</li>
     *   <li>A word starting with an ASCII digit becomes {@code LINE_NUMBER}
     *       when shorter than four characters, otherwise {@code ADDRESSES}.</li>
     *   <li>Whitespace is skipped.</li>
     *   <li>Any other word is a {@code MNUMONIC_NAMES} token when
     *       {@code Check.isMnem} accepts it, otherwise a {@code LABELS} token.</li>
     * </ul>
     *
     * <p>NOTE(review): the length-based split between line numbers and
     * addresses is a heuristic. Short hexadecimal values that begin with a
     * digit (e.g. {@code 0C}) are reported as {@code LINE_NUMBER}, and
     * hexadecimal values that begin with a letter are lexed as ordinary words.
     * Confirm the intended listing format before relying on it.
     *
     * @param input one line of text
     * @return the tokens found, in source order
     */
    public static List<Token> lex(String input)
    {
        List<Token> result = new ArrayList<Token>();
        for (int i = 0; i < input.length(); )
        {
            char ch = input.charAt(i);
            if (ch == ';')
            {
                // A comment consumes the rest of the line, so the loop ends here.
                String comment = getComment(input, i);
                result.add(new Token(Type.COMMENTS, comment));
                i += comment.length();
            }
            else if (ch >= '0' && ch <= '9')
            {
                String number = getAtom(input, i);
                i += number.length();
                if (number.length() < 4)
                {
                    result.add(new Token(Type.LINE_NUMBER, number));
                }
                else
                {
                    result.add(new Token(Type.ADDRESSES, number));
                }
            }
            else if (Character.isWhitespace(ch))
            {
                i++;
            }
            else
            {
                String atom = getAtom(input, i);
                if (atom.isEmpty())
                {
                    // Punctuation such as ',' or ':' is not part of any word.
                    // Consume it as a one-character atom so the index always
                    // advances; otherwise the loop would never terminate.
                    atom = String.valueOf(ch);
                }
                i += atom.length();
                if (Check.isMnem(atom))
                {
                    result.add(new Token(Type.MNUMONIC_NAMES, atom));
                }
                else
                {
                    result.add(new Token(Type.LABELS, atom));
                }
            }
        }
        return result;
    }

    /**
     * Test driver: lexes a file line by line and prints each token on its own
     * line.
     *
     * @param args optional; {@code args[0]} is the path of the file to lex.
     *             When absent, the built-in default path is used.
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        boolean hasPath = args != null && args.length > 0 && args[0] != null;
        srcFilename = hasPath ? args[0] : DEFAULT_SRC_FILENAME;
        srcFile = new File(srcFilename);

        // try-with-resources closes the reader even when lexing or reading throws.
        try (BufferedReader br = new BufferedReader(new FileReader(srcFile)))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                for (Token t : lex(line))
                {
                    System.out.println(t);
                }
            }
        }
    }
}
Line Addr Machine Code Label Assembly Code Comments
1 0000 00 halt
2 0001 01 pop
3 0002 02 dup
4 0003 03 exit
5 0004 04 ret
6 0005 0C not
7 0006 0D and
8 0007 0E or
9 0008 0F xor
10 0009 10 neg
11 000A 11 inc
12 000B 12 dec
13 000C 13 add
14 000D 14 sub
15 000E 15 mul
16 000F 16 div
17 0010 17 rem
18 0011 18 shl
19 0012 19 shr
20 0013 1A teq
21 0014 1B tne
22 0015 1C tlt
23 0016 1D tgt
24 0017 1E tle
25 0018 1F tge
26 0019 00 halt
1条答案
按热度按时间rqqzpn5f1#
就我所知,你在问行号、地址和机器代码之间是否应该有更多的区别——除了彼此相差1,一个是十进制的,另一个是十六进制的。
行号是用户输入文件中的行号,我认为这很简单。
addr(地址)应在每行开头打印当前的地址值,并随着汇编器把指令写入输出而递增。 考虑以下条件(如果系统允许):
机器代码应该是与该行汇编指令相对应的十六进制值。 对于多字节指令,应该打印其所有字节。