|
|
|
|
|
本文介紹如何使用 fastCSV 讀寫CSV文件。fastCSV 是一個 CSV 解析器,可以滿足我們對小、快速和易于使用的要求。
文章內容目錄
fastCSV 特征
CSV 標準
請參閱文章:
下面是一點摘要
使用代碼
下面是一些如何使用 fastCSV 的例子:
// Row type used by the read example: one instance per CSV row.
// Every member is a string, so the column text is stored as-is without conversion.
public class car
{
// you can use fields or properties
public string Year;
public string Make;
public string Model;
public string Description;
// kept as text as well; no numeric parsing happens in this class
public string Price;
}
// Read "csvstandard.csv" into a list with one car object per data row.
// The type argument must be the row class itself (car); the mapper receives
// a fresh instance of it for every row.
// listcars = List<car>
var listcars = fastCSV.ReadFile<car>(
    "csvstandard.csv", // filename
    true,              // has header
    ',',               // delimiter
    (o, c) =>          // to object function o : car object, c : columns array read
    {
        o.Year = c[0];
        o.Make = c[1];
        o.Model = c[2];
        o.Description = c[3];
        o.Price = c[4];
        // returning true adds o to the list
        return true;
    });
// Write a list of LocalWeatherData to a '|' separated file with a header row.
fastCSV.WriteFile<LocalWeatherData>(
    "filename2.csv", // filename
    new string[] { "WBAN", "Date", "SkyCondition" }, // headers defined or null
    '|', // delimiter
    list, // list of LocalWeatherData to save
    (o, c) => // from object function
    {
        c.Add(o.WBAN);
        // Format with the invariant culture: the file is machine-readable data, and a
        // current culture with a non-Gregorian calendar would write a different year
        // than the reader reconstructs with new DateTime(year, month, day).
        c.Add(o.Date.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture));
        c.Add(o.SkyCondition);
    });
性能輔助函數
fastCSV 具有以下輔助功能:
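int ToInt(string s):從字符串創建 int
int ToInt(string s, int index, int count):從子字符串創建 int
DateTime ToDateTimeISO(string value, bool UseUTCDateTime):創建 ISO 標準的 DateTime,即 yyyy-MM-ddTHH:mm:ss(可選部分 .nnnZ)
實例代碼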
// Row type used by the weather examples: one instance per record of the hourly file.
public class LocalWeatherData
{
// stored as the column text, unchanged
public string WBAN;
// the only member that is not a string; the examples build it from the file's date column
public DateTime Date;
public string SkyCondition;
}
// Load the hourly weather file (first line is a header, columns separated by ',').
var list = fastCSV.ReadFile<LocalWeatherData>("201503hourly.txt", true, ',', (o, c) =>
{
    o.WBAN = c[0];

    // c[1] data is in "20150301" format: 4 digit year, 2 digit month, 2 digit day
    int year = fastCSV.ToInt(c[1], 0, 4);
    int month = fastCSV.ToInt(c[1], 4, 2);
    int day = fastCSV.ToInt(c[1], 6, 2);
    o.Date = new DateTime(year, month, day);

    o.SkyCondition = c[4];

    // Every row is kept. To filter rows out, return false for them instead,
    // for example when o.Date.Day % 2 == 0.
    return true;
});
使用場景
在映射函數中對不需要的行 return false;,即可過濾掉不需要的行。
也可以在映射函數中自行處理每一行(例如轉交給其他系統)后 return false;,或者讀完整個文件后使用返回的 List<T>。
還可以另外維護一個 List<T>,它與CSV文件的列無關,用于對讀取的行做 sum/min/max/avg 等聚合。
代碼里面
本質上,讀取是一個循環:解析一行,為列表創建一個通用元素,將創建的對象和從行中提取的列傳遞給用戶定義的映射函數,如果映射函數返回 true,則將該對象添加到要返回的列表中:
// split the current line into its column values
var c = ParseLine(line, delimiter, cols);
// fresh element for this row
T o = new T();
// the user mapper fills the element and decides whether it is kept
if (mapper(o, c))
{
    list.Add(o);
}
現在 CSV 標準的復雜性來自正確處理跨多行的字段:通過計算一行中引號的個數是否為奇數來判斷該記錄是否跨多行,如果是,則繼續讀取后續行直到引號總數為偶數,這是在 ReadFile() 函數中完成的。
這種方法的美妙之處在于它簡單,不進行反射并且非常快,控制權在用戶手中。
所有讀取代碼如下:
/// <summary>
/// Reads a delimited text file record by record and maps each record onto a new T.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <param name="hasheader">
/// When true, the first line is only used to size the column buffer (delimiter count + 1)
/// and is not passed to the mapper. When false, the buffer has _COLCOUNT slots.
/// </param>
/// <param name="delimiter">Column separator character.</param>
/// <param name="mapper">
/// Called with a fresh T and the parsed columns; the object is added to the result
/// only when the mapper returns true. The same column array is handed over for every record.
/// </param>
/// <returns>The objects accepted by the mapper, in file order.</returns>
/// <exception cref="Exception">
/// Any failure while handling a line is rethrown as "error on line N" with the original
/// exception as inner exception. N counts physical lines starting at 0.
/// </exception>
public static List<T> ReadFile<T>(string filename, bool hasheader, char delimiter, ToOBJ<T> mapper) where T : new()
{
    string[] cols = null;
    List<T> list = new List<T>();
    int linenum = -1;
    // collects the physical lines of a record whose quoted field contains line breaks
    StringBuilder sb = new StringBuilder();
    // true while sb holds an unfinished record, i.e. its quote count so far is odd
    bool insb = false;
    foreach (var line in File.ReadLines(filename))
    {
        try
        {
            linenum++;
            if (linenum == 0)
            {
                if (hasheader)
                {
                    // actual col count
                    int cc = CountOccurence(line, delimiter);
                    if (cc == 0)
                        throw new Exception("File does not have '" + delimiter + "' as a delimiter");
                    cols = new string[cc + 1];
                    continue;
                }
                else
                    cols = new string[_COLCOUNT];
            }

            bool oddQuotes = CountOccurence(line, '\"') % 2 == 1;
            string cline = line;
            if (insb || oddQuotes)
            {
                sb.Append(line);
                // The buffered text always has an odd quote count, so only a line that is
                // itself odd can balance it. Tracking the parity avoids re-counting the
                // whole buffer for every continuation line.
                bool closesRecord = insb && oddQuotes;
                if (!closesRecord)
                {
                    insb = true;
                    // keep the line break that belongs to the quoted value
                    sb.AppendLine();
                    continue;
                }
                cline = sb.ToString();
                sb.Clear();
                insb = false;
            }

            var c = ParseLine(cline, delimiter, cols);
            T o = new T();
            var b = mapper(o, c);
            if (b)
                list.Add(o);
        }
        catch (Exception ex)
        {
            throw new Exception("error on line " + linenum, ex);
        }
    }
    // NOTE(review): a record that is still open (odd quote count) when the file ends
    // stays in the buffer and is not returned.
    return list;
}
/// <summary>
/// Counts how many times <paramref name="c"/> occurs in <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to scan; must not be null.</param>
/// <param name="c">Character to count.</param>
/// <returns>Number of occurrences, 0 for an empty string.</returns>
private static int CountOccurence(string text, char c)
{
    int count = 0;
    // Visit exactly text.Length characters. The earlier pointer loop stepped one
    // position past the last character and compared that position too, which
    // over-counted when c was '\0'.
    for (int i = 0; i < text.Length; i++)
    {
        if (text[i] == c)
            count++;
    }
    return count;
}
// Splits one logical CSV line into column values.
// line      : the text to split (for multi-line records the caller passes the joined text)
// delimiter : column separator
// columns   : caller-supplied buffer that is filled from slot 0 upwards
// returns   : the same columns array that was passed in
// NOTE(review): slots after the last parsed column are not written, so they keep whatever
// they held on entry; ReadFile passes the same array for every line.
// NOTE(review): when the line ends with the delimiter the loop stops at index == linelen
// and no trailing empty column is stored.
private unsafe static string[] ParseLine(string line, char delimiter, string[] columns)
{
//return line.Split(delimiter);
// col is the next slot to fill, index the position where the current field starts
int col = 0;
int linelen = line.Length;
int index = 0;
// pin the string so its characters can be read through a raw pointer
fixed (char* l = line)
{
while (index < linelen)
{
if (*(l + index) != '\"')
{
// non quoted
var next = line.IndexOf(delimiter, index);
if (next < 0)
{
// no further delimiter: the rest of the line is the last column
columns[col++] = new string(l, index, linelen - index);
break;
}
columns[col++] = new string(l, index, next - index);
// continue with the character right after the delimiter
index = next + 1;
}
else
{
// quoted string change "" -> "
// qc counts the quotes seen so far, starting with the opening one
int qc = 1;
int start = index;
// NOTE(review): when the opening quote is the last character this reads the position
// after the end of the text - presumably the string terminator; confirm
char c = *(l + ++index);
// find matching quote until delim or EOL
while (index++ < linelen)
{
if (c == '\"')
qc++;
// a delimiter only ends the field when every quote seen so far is paired
if (c == delimiter && qc % 2 == 0)
break;
c = *(l + index);
}
// Here index is two positions past the closing quote (both when the loop stopped on the
// delimiter and when it ran to the end of the line), so index - start - 3 is the length
// of the text between the outer quotes.
columns[col++] = new string(l, start + 1, index - start - 3).Replace("\"\"", "\"");
}
}
}
return columns;
}
ParseLine() 負責以優化的 unsafe 方式從一行中提取列。
而寫入文件代碼則是:
/// <summary>
/// Writes a list of objects to a delimited text file, one line per object.
/// </summary>
/// <param name="filename">Path of the file to create; an existing file is overwritten.</param>
/// <param name="headers">Header names written as the first line, or null for no header line.</param>
/// <param name="delimiter">Column separator character.</param>
/// <param name="list">Objects to write.</param>
/// <param name="mapper">Called once per object to append that object's column values to the supplied list.</param>
/// <exception cref="ArgumentNullException"><paramref name="list"/> or <paramref name="mapper"/> is null.</exception>
/// <remarks>
/// A value is wrapped in quotes when it contains a quote, a line break or the delimiter;
/// embedded quotes are doubled. A null value is written as an empty column.
/// Header names are joined as given, without quoting.
/// </remarks>
public static void WriteFile<T>(string filename, string[] headers, char delimiter, List<T> list, FromObj<T> mapper)
{
    // Validate before the file is opened so a bad call does not truncate an existing file.
    if (list == null)
        throw new ArgumentNullException("list");
    if (mapper == null)
        throw new ArgumentNullException("mapper");

    // characters that force a value to be quoted
    char[] special = new char[] { '\"', '\r', '\n', delimiter };

    // Disposing the writer flushes it and closes the underlying stream,
    // so no explicit Flush/Close is needed.
    using (FileStream f = new FileStream(filename, FileMode.Create, FileAccess.Write))
    using (StreamWriter s = new StreamWriter(f))
    {
        if (headers != null)
            s.WriteLine(string.Join(delimiter.ToString(), headers));

        foreach (var o in list)
        {
            List<object> cols = new List<object>();
            mapper(o, cols);
            for (int i = 0; i < cols.Count; i++)
            {
                // a null cell (or a ToString() that returns null) becomes an empty column
                object cell = cols[i];
                string str = cell == null ? null : cell.ToString();
                if (str == null)
                    str = string.Empty;

                if (str.IndexOfAny(special) >= 0)
                {
                    s.Write('\"');
                    s.Write(str.Replace("\"", "\"\""));
                    s.Write('\"');
                }
                else
                {
                    s.Write(str);
                }

                if (i < cols.Count - 1)
                    s.Write(delimiter);
            }
            s.WriteLine();
        }
    }
}
示例用例
在數據科學中,你通常將數據拆分為訓練集和測試集,在下面的示例中,每 3 行中取 1 行用于測試(你可以更詳細地拆分):
// rows diverted away from the training list
var testing = new List<LocalWeatherData>();
// number of data rows handed to the mapper so far
int line = 0;
var training = fastCSV.ReadFile<LocalWeatherData>("201503hourly.txt", true, ',', (o, c) =>
{
    line++;

    // c[1] is read as 4 digit year, 2 digit month, 2 digit day
    int year = fastCSV.ToInt(c[1], 0, 4);
    int month = fastCSV.ToInt(c[1], 4, 2);
    int day = fastCSV.ToInt(c[1], 6, 2);
    o.Date = new DateTime(year, month, day);
    o.SkyCondition = c[4];

    // two out of every three rows stay in the returned (training) list
    if (line % 3 != 0)
        return true;

    // every third row goes to the testing list and is kept out of the result
    testing.Add(o);
    return false;
});
性能表現
新的性能數字如下所示,與 v1 代碼相比,在相同的 4,496,263 行數據集上以稍微多一點的內存使用為代價快了近 2 倍。
有趣的是,在 .net core上,該庫使用的內存更少。
下載 fastCSV
本站提供 fastCSV 源碼直接下載。
總結
本文介紹了如何使用 fastCSV 讀寫CSV文件。fastCSV 是一個 CSV 解析器,可以滿足我們對小、快速和易于使用的要求。.NET 4.0編譯 fastCSV 后,它的dll只有8KB。
相關文章