// Pseudo-code sketch (not compilable Java as written): split a CSV `row` into
// fields, treating the text between a pair of double quotes as one field.
line = String[];
if ('"' in row){
// Split on the quote character. Assuming the quotes are balanced, pieces at
// even indices lie outside quotes and pieces at odd indices lie between quotes.
vals = row.split('"');
// Pieces outside quotes are split further on commas.
for (int i =0; i<vals.length();i+=2){
line+=vals[i].split(',');
}
// Pieces between quotes are appended whole, embedded commas included.
for (int j=1; j<vals.length();j+=2){
line+=vals[j];
}
// NOTE(review): the two loops run one after the other, so every unquoted piece
// is appended before any quoted piece; `line` therefore does not follow the
// column order of `row` - confirm whether the order matters to the caller.
}
else{
// No quote anywhere in the row: a plain comma split is sufficient.
line = row.split(',')
}
package bestsss.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Minimal CSV record reader.
 *
 * <p>Splits one logical CSV record into its fields. A record may span several
 * physical lines when a quoted field contains line breaks.
 */
public class SplitCSVLine {
    /** Initial capacity of the field list when the caller gives no usable column hint. */
    private static final int DEFAULT_COLUMNS = 8;

    /**
     * Reads one record using a comma as separator and a double quote as quote character.
     *
     * @param reader source of lines
     * @return the fields of the record, see
     *         {@link #splitCSV(BufferedReader, int[], char, char)}
     * @throws IOException if reading from {@code reader} fails
     */
    public static String[] splitCSV(BufferedReader reader) throws IOException {
        return splitCSV(reader, null, ',', '"');
    }

    /**
     * Reads one record from {@code reader} and splits it into fields.
     *
     * <p>A field that starts with {@code quote} is a quoted field: separators and
     * line breaks inside it belong to the field, and a doubled quote stands for one
     * literal quote. The closing quote must be followed by a separator or the end of
     * the line; any other quote is kept as a literal character. Line breaks inside a
     * quoted field are always stored as {@code '\n'}, whatever the original line
     * terminator was.
     *
     * <p>The method always returns at least one field. At end of stream it returns a
     * single empty field, exactly as it does for a blank line, so callers that read
     * several records must detect end of stream themselves. If a quoted field is never
     * closed, all remaining lines are consumed into that field.
     *
     * @param reader          source of lines
     * @param expectedColumns optional one-element in/out array: on entry element 0 is
     *                        used as a capacity hint for the field list, on return it
     *                        holds the number of fields found. May be {@code null};
     *                        an empty array or a non-positive hint is ignored.
     * @param separator       field separator, e.g. {@code ','} or {@code ';'}
     * @param quote           quote character, e.g. {@code '"'}
     * @return the fields of the record, never {@code null} and never empty
     * @throws IOException if reading from {@code reader} fails
     */
    public static String[] splitCSV(BufferedReader reader, int[] expectedColumns, char separator, char quote) throws IOException {
        // A negative hint would make the ArrayList constructor throw and an empty
        // array would fail on index 0, so both are treated like "no hint".
        final boolean hasHint = expectedColumns != null && expectedColumns.length > 0;
        final int capacity = hasHint && expectedColumns[0] > 0 ? expectedColumns[0] : DEFAULT_COLUMNS;

        final List<String> tokens = new ArrayList<String>(capacity);
        final StringBuilder field = new StringBuilder(24);
        boolean quoted = false;

        String line;
        while ((line = reader.readLine()) != null) {
            for (int i = 0, len = line.length(); i < len; i++) {
                final char c = line.charAt(i);
                if (c == quote) {
                    final boolean last = i == len - 1;
                    if (quoted && !last && line.charAt(i + 1) == quote) {
                        // doubled quote inside a quoted field: one literal quote
                        field.append(c);
                        i++; // skip the second quote of the pair
                    } else if (quoted && (last || line.charAt(i + 1) == separator)) {
                        // closing quote: must be followed by separator or end of line (RFC 4180)
                        quoted = false;
                    } else if (!quoted && field.length() == 0) {
                        // opening quote: only recognised at the very start of a field
                        quoted = true;
                    } else {
                        // stray quote (RFC 4180 forbids it); keep it literally rather than fail
                        field.append(c);
                    }
                } else if (c == separator && !quoted) {
                    tokens.add(field.toString());
                    field.setLength(0);
                } else {
                    field.append(c);
                }
            }
            if (!quoted) {
                break;
            }
            // the quoted field continues on the next physical line
            field.append('\n');
        }

        tokens.add(field.toString()); // the last field has no trailing separator
        if (hasHint) {
            expectedColumns[0] = tokens.size();
        }
        return tokens.toArray(new String[0]);
    }

    /** Small demo: parses one record containing escaped quotes and a multi-line field. */
    public static void main(String[] args) throws Throwable {
        java.io.StringReader r = new java.io.StringReader("222,\"\"\"zzzz\", abc\"\" , 111 ,\"1\n2\n3\n\"");
        System.out.println(java.util.Arrays.toString(splitCSV(new BufferedReader(r))));
    }
}
/**
 * Matches one double-quoted field at the start of the input (optionally surrounded
 * by whitespace) together with the comma that terminates it. Group 1 is the text
 * between the quotes, with embedded quotes still doubled.
 */
final static Pattern quote = Pattern.compile("^\\s*\"((?:[^\"]|(?:\"\"))*?)\"\\s*,");

/**
 * Splits a single CSV line into its fields.
 *
 * <p>A field whose first non-whitespace character is a double quote is treated as
 * quoted: commas inside it do not split the field, a doubled quote stands for one
 * literal quote, and whitespace outside the quotes is dropped. Every other field is
 * returned exactly as written, surrounding whitespace included. An empty line yields
 * a single empty field.
 *
 * @param line one CSV line without its line terminator; must not be {@code null}
 * @return the fields in order of appearance
 * @throws IllegalArgumentException if {@code line} is {@code null}
 * @throws java.text.ParseException if a quoted field is not closed, or its closing
 *         quote is not followed by a comma or the end of the line; the error offset
 *         is the index in {@code line} where that field starts
 * @throws Exception declared for compatibility with existing callers
 */
public static List<String> parseCsv(String line) throws Exception
{
    if (line == null)
        throw new IllegalArgumentException("line must not be null");

    // A trailing comma acts as sentinel so that every field, the last one
    // included, is terminated by a comma.
    final String text = line + ",";
    final int len = text.length();
    final List<String> list = new ArrayList<String>();
    final Matcher m = quote.matcher(text);

    int x = 0;
    while (x < len)
    {
        // Skip leading whitespace using the same definition as String.trim()
        // (any char <= ' ') to find out whether the field is quoted.
        int first = x;
        while (first < len && text.charAt(first) <= ' ')
            first++;

        if (first < len && text.charAt(first) == '"')
        {
            // Restrict the matcher to the unparsed rest instead of copying it.
            m.region(x, len);
            if (!m.lookingAt())
                throw new java.text.ParseException("CSV is malformed", x);
            list.add(m.group(1).replace("\"\"", "\""));
            x = m.end(); // the match includes the terminating comma
        }
        else
        {
            // The sentinel guarantees that a comma is always found.
            int y = text.indexOf(',', x);
            list.add(text.substring(x, y));
            x = y + 1;
        }
    }
    return list;
}
def parserow(line):
    ''' Split one CSV line into fields on commas ',' while keeping commas that
    appear inside double quotes '"' as part of the field.

    The quote characters themselves and any surrounding whitespace are kept in
    the returned field text.

    example:
    fieldname1,fieldname2,fieldname3
    field value1,"field, value2, allowing, commas", field value3
    gives:
    ['field value1','"field, value2, allowing, commas"', ' field value3']

    An empty line gives an empty list. A line ending in a separating comma
    gives a trailing empty field, e.g. 'a,b,' -> ['a', 'b', ''].
    '''
    out = []
    chars = []               # characters of the field currently being built
    within_quote = False
    after_separator = False  # True while the last character read was a field separator
    for c in line:
        if c == ',' and not within_quote:
            out.append(''.join(chars))
            chars = []
            after_separator = True
            continue
        if c == '"':
            within_quote = not within_quote
        chars.append(c)
        after_separator = False
    # Emit the last field; it may be empty when the line ends with a separator.
    if chars or after_separator:
        out.append(''.join(chars))
    return out
5条答案
按热度按时间brgchamk1#
可以将 `Matcher.find` 与以下正则表达式配合使用。下面是一个更完整的示例:
在线查看工作情况:ideone
k2arahey2#
我遇到过同样的问题(不过是在 Python 中)。我找到的一种不用正则表达式的解决方法是:拿到一行后,先检查其中是否包含引号;如果有,就按引号分割字符串,再把结果数组中偶数索引的元素按逗号分割,而奇数索引的元素就是完整的带引号的值。
我不是Java程序员,所以把它当作伪代码...
或者,使用正则表达式。
px9o7tmv3#
这里有一些代码,我希望在这里使用的代码不包括开源代码。
yvt65v4c4#
下面的代码看起来运行良好,可以处理引号中的引号。
mutmk8jj5#
这是我用 Python 编写的解决方案,它可以处理单层引号(即不嵌套的双引号)。