From 29f7aa37dab509f52080530ddee4555db63d066b Mon Sep 17 00:00:00 2001 From: Simon Diesenreiter Date: Tue, 3 Dec 2024 18:07:55 +0100 Subject: [PATCH] feat: initial support for fragment parsing, ref: NOISSUE --- TextParser.Tests/TextParserTests.cs | 63 ++++++++- TextParser/LineParser.cs | 8 +- TextParser/Schema/BlockType.cs | 9 +- .../BuildingBlocks/FixedRepetitionBlock.cs | 3 +- .../BuildingBlocks/GreedyRepetitionBlock.cs | 3 +- TextParser/Schema/FragmentSchema.cs | 84 ++++++++++++ TextParser/Schema/FragmentSchemaBuilder.cs | 128 ++++++++++++++++++ TextParser/Schema/ISchema.cs | 25 ++++ TextParser/Schema/ISchemaBuilder.cs | 8 ++ TextParser/Schema/InputSchema.cs | 12 +- TextParser/Schema/InputSchemaBuilder.cs | 8 +- TextParser/Schema/InputType.cs | 4 +- TextParser/Schema/RepetitionSchemaBuilder.cs | 10 +- TextParser/Schema/RepetitionType.cs | 4 +- TextParser/TextParser.cs | 12 +- TextParser/TokenConverter.cs | 98 ++++++++++++++ TextParser/Tokenization/FragmentToken.cs | 38 ++++++ TextParser/Tokenization/InputProvider.cs | 6 + 18 files changed, 489 insertions(+), 34 deletions(-) create mode 100644 TextParser/Schema/FragmentSchema.cs create mode 100644 TextParser/Schema/FragmentSchemaBuilder.cs create mode 100644 TextParser/Schema/ISchema.cs create mode 100644 TextParser/Schema/ISchemaBuilder.cs create mode 100644 TextParser/Tokenization/FragmentToken.cs diff --git a/TextParser.Tests/TextParserTests.cs b/TextParser.Tests/TextParserTests.cs index 75e6372..6a4af45 100644 --- a/TextParser.Tests/TextParserTests.cs +++ b/TextParser.Tests/TextParserTests.cs @@ -26,7 +26,7 @@ public class TextParserTests .EndRepetition() .Build(); - var parser = new LineParser(schema); + var parser = new LineParser(schema); var tokens = parser.ParseLine(testInput1); Assert.Equal(4, tokens.Count); @@ -51,7 +51,7 @@ public class TextParserTests .Expect(InputType.Integer) .Build(); - var parser = new LineParser(schema); + var parser = new LineParser(schema); var tokens = parser.ParseLine(testInput1); Assert.Equal(4, tokens.Count); @@ -79,7 +79,7 @@ public class TextParserTests .EndRepetition() .Build(); - var parser = new LineParser(schema); + var parser = new LineParser(schema); var tokens = parser.ParseLine(testInput2); Assert.Equal(6, tokens.Count); @@ -107,7 +107,7 @@ public class TextParserTests .EndRepetition() .Build(); - var parser = new TextParser(schema); + var parser = new TextParser(schema); var rows = parser .SetInputText(testInput3) .Parse() @@ -139,7 +139,7 @@ public class TextParserTests .EndRepetition() .Build(); - var parser = new TextParser(schema); + var parser = new TextParser(schema); var columns = parser .SetInputText(testInput3) .Parse() @@ -174,7 +174,7 @@ public class TextParserTests .EndRepetition() .Build(); - var parser = new TextParser(schema); + var parser = new TextParser(schema); var rows = parser .SetInputText(testInput4) .Parse() @@ -199,4 +199,55 @@ public class TextParserTests Assert.Equal(InputType.String, rows[2][3].GetInputType()); Assert.Equal(InputType.String, rows[2][4].GetInputType()); } + + [Fact] + public void FragmentParser_SimpleTest() + { + var schemaBuilder = new FragmentSchemaBuilder(); + var schema = schemaBuilder + .StartOptions() + .Option() + .Expect("nums(") + .Expect(InputType.Integer) + .Repeat() + .Expect(",") + .Expect(InputType.Integer) + .EndRepetition() + .Expect(")") + .Option() + .Expect("strs(") + .Expect(InputType.String) + .Repeat() + .Expect(",") + .Expect(InputType.String) + .EndRepetition() + .Expect(")") + .EndOptions() + .Build(); + + var parser = new TextParser(schema); + var rows = parser + .SetInputText(testInput4) + .Parse() + .AsFragments(); + + Assert.Equal(3, rows.Count); + Assert.Equal(6, rows[0].Count); + Assert.Equal(3, rows[1].Count); + Assert.Equal(5, rows[2].Count); + // Assert.Equal(InputType.Integer, rows[0][0].GetInputType()); + // Assert.Equal(InputType.String, rows[0][1].GetInputType()); + // Assert.Equal(InputType.String, rows[0][2].GetInputType()); + // Assert.Equal(InputType.String, rows[0][3].GetInputType()); + // Assert.Equal(InputType.String, rows[0][4].GetInputType()); + // Assert.Equal(InputType.String, rows[0][5].GetInputType()); + // Assert.Equal(InputType.Integer, rows[1][0].GetInputType()); + // Assert.Equal(InputType.String, rows[1][1].GetInputType()); + // Assert.Equal(InputType.String, rows[1][2].GetInputType()); + // Assert.Equal(InputType.Integer, rows[2][0].GetInputType()); + // Assert.Equal(InputType.String, rows[2][1].GetInputType()); + // Assert.Equal(InputType.Integer, rows[2][2].GetInputType()); + // Assert.Equal(InputType.String, rows[2][3].GetInputType()); + // Assert.Equal(InputType.String, rows[2][4].GetInputType()); + } } diff --git a/TextParser/LineParser.cs b/TextParser/LineParser.cs index 1a1ce58..2fb8ae5 100644 --- a/TextParser/LineParser.cs +++ b/TextParser/LineParser.cs @@ -3,14 +3,14 @@ namespace Parsing; using Parsing.Schema; using Parsing.Tokenization; -public class LineParser +public class LineParser where T : ISchemaContext { private string[] delimiters; private bool removeEmptyEntries = false; - private InputSchema schema; - private InputSchemaContext context; + private ISchema schema; + private T context; - public LineParser(InputSchema schema, string[]? delimiters = null, bool removeEmptyEntries = true) + public LineParser(ISchema schema, string[]? delimiters = null, bool removeEmptyEntries = true) { this.delimiters = delimiters ?? new string[] { " " }; this.removeEmptyEntries = removeEmptyEntries; diff --git a/TextParser/Schema/BlockType.cs b/TextParser/Schema/BlockType.cs index c67ba4a..15f0bfd 100644 --- a/TextParser/Schema/BlockType.cs +++ b/TextParser/Schema/BlockType.cs @@ -1,6 +1,13 @@ namespace Parsing.Schema; +[Flags] public enum BlockType { - Integer, String, FixedRepetition, GreedyRepetition + Integer = 1, + String = 2, + // technically not a block type but keeping it here for consistency/having all types in one place + Fragment = 4, + FixedRepetition = 8, + GreedyRepetition = 16, + NonZeroRepetition = 32, } diff --git a/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs b/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs index 33be031..ec5478a 100644 --- a/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs +++ b/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs @@ -1,6 +1,7 @@ namespace Parsing.Schema.BuildingBlocks; using System.IO.Pipelines; +using System.Linq; using Parsing.Tokenization; class FixedRepetitionBlock : BuildingBlockBase @@ -30,7 +31,7 @@ class FixedRepetitionBlock : BuildingBlockBase this.context = this.inputSchema.CreateContext(); } } - return result; + return result.SingleOrDefault(); } public override bool CanParseWord(InputProvider inputs) diff --git a/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs b/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs index 59c9260..9de0639 100644 --- a/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs +++ b/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs @@ -1,6 +1,7 @@ namespace Parsing.Schema.BuildingBlocks; using System.IO.Pipelines; +using System.Linq; using Parsing.Tokenization; class GreedyRepetitionBlock : BuildingBlockBase @@ -21,7 +22,7 @@ class GreedyRepetitionBlock : BuildingBlockBase { this.context = this.inputSchema.CreateContext(); } - return result; + return result.SingleOrDefault(); } public override bool CanParseWord(InputProvider inputs) diff --git a/TextParser/Schema/FragmentSchema.cs b/TextParser/Schema/FragmentSchema.cs new file mode 100644 index 0000000..87e288c --- /dev/null +++ b/TextParser/Schema/FragmentSchema.cs @@ -0,0 +1,84 @@ +namespace Parsing.Schema; + +using Parsing.Schema; +using Parsing.Schema.BuildingBlocks; +using Parsing.Tokenization; +using System.Collections; +using System.Text.RegularExpressions; + +public class FragmentSchemaContext : ISchemaContext +{ + public int lastProcessedBlockIndex { get; set; } = 0; + public bool HasFinished { get; set; } = false; +} + +public class FragmentSchema : ISchema +{ + private string fragmentRegex; + + public FragmentSchema(string fragmentRegex) + { + this.fragmentRegex = fragmentRegex; + } + + public List ProcessNextWord(FragmentSchemaContext currentContext, InputProvider inputs) + { + Regex r = new Regex(this.fragmentRegex); + + var tokenList = new List(); + // one token per match + foreach (Match match in r.Matches(inputs.YieldWord())) + { + var newToken = new FragmentToken(match.Result("$1")); + // token contains data from all included matches + foreach (var groupKey in match.Groups.Keys) + { + List matchedSubstrings = new List(); + foreach (var capture in match.Groups[groupKey].Captures) + { + //matchedSubstrings.Add(capture.Value); + } + newToken.AddMatch(groupKey, matchedSubstrings); + } + } + + return tokenList; + } + + public bool CanProcessNextWord(FragmentSchemaContext currentContext, InputProvider inputs) + { + using (inputs.GetLookaheadContext()) + { + return this.CanProcessNextWord(currentContext, inputs.YieldWord()); + } + } + + public bool CanProcessNextWord(FragmentSchemaContext currentContext, string word) + { + if (currentContext.HasFinished) + { + return false; + } + Regex r = new Regex(this.fragmentRegex); + return r.Match(word).Success; + } + + public List ProcessWordList(string[] words) + { + List tokens = new List(); + InputProvider inputs = new InputProvider(words); + var overallContext = this.CreateContext(); + + while (this.CanProcessNextWord(overallContext, inputs)) + { + tokens.AddRange(this.ProcessNextWord(overallContext, inputs)); + } + + return tokens; + } + + public FragmentSchemaContext CreateContext() + { + return new FragmentSchemaContext(); + } +} diff --git a/TextParser/Schema/FragmentSchemaBuilder.cs b/TextParser/Schema/FragmentSchemaBuilder.cs new file mode 100644 index 0000000..9988220 --- /dev/null +++ b/TextParser/Schema/FragmentSchemaBuilder.cs @@ -0,0 +1,128 @@ +namespace Parsing.Schema; + +using Parsing.Schema.BuildingBlocks; +using System.Text.RegularExpressions; + +public class FragmentSchemaBuilder : RepetitionSchemaBuilder, ISchemaBuilder +{ + protected string fragmentRegex = @""; + + public FragmentSchemaBuilder() + { + } + + public FragmentSchemaBuilder StartOptions() + { + this.fragmentRegex += "(("; + return this; + } + + public FragmentSchemaBuilder EndOptions() + { + this.fragmentRegex += "))"; + return this; + } + + public FragmentSchemaBuilder Option() + { + // if we just started an options group there is no need to add an option separator + if (!this.fragmentRegex.EndsWith("(") || this.fragmentRegex.EndsWith("\\(")) + { + this.fragmentRegex += ")|("; + } + return this; + } + + public FragmentSchemaBuilder Expect(InputType type, string name = "") + { + string groupNamePrefix = ""; + if (!string.IsNullOrEmpty(name)) + { + groupNamePrefix = "?<" + name + ">"; + } + switch (type) + { + case InputType.String: + this.fragmentRegex += "(" + groupNamePrefix + "\\w+)"; + break; + case InputType.Integer: + this.fragmentRegex += "(" + groupNamePrefix + "\\d+)"; + break; + default: + throw new Exception("Unrecognized InputType"); + } + return this; + } + + public FragmentSchemaBuilder Expect(string literal) + { + this.fragmentRegex += Regex.Escape(literal); + return this; + } + + public FragmentSchemaBuilder Repeat(int repetitionCount) + { + // add another layer of parsing + var newSchemaBuilder = this.GetNewRepetitionSchemaBuilder(this); + newSchemaBuilder.NumRepetition = repetitionCount; + newSchemaBuilder.RepetitionType = RepetitionType.FixedRepetition; + + return newSchemaBuilder; + } + + public FragmentSchemaBuilder Repeat() + { + // add another layer of parsing + var newSchemaBuilder = this.GetNewRepetitionSchemaBuilder(this); + newSchemaBuilder.RepetitionType = RepetitionType.GreedyRepetition; + + return newSchemaBuilder; + } + + public FragmentSchemaBuilder Repeat(RepetitionType repetitionType) + { + // add another layer of parsing + var newSchemaBuilder = this.GetNewRepetitionSchemaBuilder(this); + newSchemaBuilder.RepetitionType = repetitionType; + + return newSchemaBuilder; + } + + public FragmentSchemaBuilder EndRepetition() + { + // return back to upper layer of parsing + var currentBuilder = this as FragmentSchemaBuilder; + if (currentBuilder == null) + { + throw new Exception("Invalid repetition definitions!"); + } + var oldSchemaBuilder = currentBuilder.UpperLayerBuilder; + + var currentRegex = "(" + currentBuilder.fragmentRegex + ")"; + switch (currentBuilder.RepetitionType) + { + case RepetitionType.FixedRepetition: + currentRegex += "{" + this.NumRepetition.ToString() + "}"; + break; + case RepetitionType.GreedyRepetition: + currentRegex += "*"; + break; + case RepetitionType.NonZeroRepetition: + case RepetitionType.NonZeroRepetition | RepetitionType.GreedyRepetition: + currentRegex += "+"; + break; + default: + throw new Exception("Unrecognized RepetitionType"); + } + + oldSchemaBuilder.fragmentRegex += "(" + currentRegex + ")"; + + return oldSchemaBuilder; + } + + public FragmentSchema Build() + { + var schema = new FragmentSchema(this.fragmentRegex); + return schema; + } +} diff --git a/TextParser/Schema/ISchema.cs b/TextParser/Schema/ISchema.cs new file mode 100644 index 0000000..a4f896b --- /dev/null +++ b/TextParser/Schema/ISchema.cs @@ -0,0 +1,25 @@ +namespace Parsing.Schema; + +using Parsing.Schema; +using Parsing.Schema.BuildingBlocks; +using Parsing.Tokenization; +using System.Collections; + +public interface ISchemaContext +{ + public int lastProcessedBlockIndex { get; set; } + public bool HasFinished { get; set; } +} + +public interface ISchema where T : ISchemaContext +{ + public List ProcessNextWord(T currentContext, InputProvider inputs); + + public bool CanProcessNextWord(T currentContext, InputProvider inputs); + + public bool CanProcessNextWord(T currentContext, string word); + + public List ProcessWordList(string[] words); + + public T CreateContext(); +} diff --git a/TextParser/Schema/ISchemaBuilder.cs b/TextParser/Schema/ISchemaBuilder.cs new file mode 100644 index 0000000..8016084 --- /dev/null +++ b/TextParser/Schema/ISchemaBuilder.cs @@ -0,0 +1,8 @@ +namespace Parsing.Schema; + +using Parsing.Schema.BuildingBlocks; + +public interface ISchemaBuilder where T : ISchema where U : ISchemaContext +{ + public T Build(); +} diff --git a/TextParser/Schema/InputSchema.cs b/TextParser/Schema/InputSchema.cs index 7d9041e..3f4e199 100644 --- a/TextParser/Schema/InputSchema.cs +++ b/TextParser/Schema/InputSchema.cs @@ -5,13 +5,13 @@ using Parsing.Schema.BuildingBlocks; using Parsing.Tokenization; using System.Collections; -public class InputSchemaContext +public class InputSchemaContext : ISchemaContext { public int lastProcessedBlockIndex { get; set; } = 0; public bool HasFinished { get; set; } = false; } -public class InputSchema +public class InputSchema : ISchema { private List buildingBlocks; @@ -25,7 +25,7 @@ public class InputSchema this.buildingBlocks.Add(buildingBlock); } - public IToken ProcessNextWord(InputSchemaContext currentContext, InputProvider inputs) + public List ProcessNextWord(InputSchemaContext currentContext, InputProvider inputs) { var nextBlock = this.buildingBlocks[currentContext.lastProcessedBlockIndex]; var token = nextBlock.ParseWord(inputs); @@ -34,7 +34,9 @@ public class InputSchema currentContext.lastProcessedBlockIndex++; currentContext.HasFinished = currentContext.lastProcessedBlockIndex >= this.buildingBlocks.Count; } - return token; + var newTokenList = new List(); + newTokenList.Add(token); + return newTokenList; } public bool CanProcessNextWord(InputSchemaContext currentContext, InputProvider inputs) @@ -68,7 +70,7 @@ public class InputSchema while (this.CanProcessNextWord(overallContext, inputs)) { - tokens.Add(this.ProcessNextWord(overallContext, inputs)); + tokens.AddRange(this.ProcessNextWord(overallContext, inputs)); } return tokens; diff --git a/TextParser/Schema/InputSchemaBuilder.cs b/TextParser/Schema/InputSchemaBuilder.cs index 417b744..bc608f6 100644 --- a/TextParser/Schema/InputSchemaBuilder.cs +++ b/TextParser/Schema/InputSchemaBuilder.cs @@ -2,7 +2,7 @@ using Parsing.Schema.BuildingBlocks; -public class InputSchemaBuilder +public class InputSchemaBuilder : RepetitionSchemaBuilder, ISchemaBuilder { private InputSchema schema = new InputSchema(); @@ -31,7 +31,7 @@ public class InputSchemaBuilder public InputSchemaBuilder Repeat(int repetitionCount) { // add another layer of parsing - var newInputSchemaBuilder = new RepetitionSchemaBuilder(this); + var newInputSchemaBuilder = this.GetNewRepetitionSchemaBuilder(this); newInputSchemaBuilder.NumRepetition = repetitionCount; newInputSchemaBuilder.RepetitionType = RepetitionType.FixedRepetition; @@ -41,7 +41,7 @@ public class InputSchemaBuilder public InputSchemaBuilder Repeat() { // add another layer of parsing - var newInputSchemaBuilder = new RepetitionSchemaBuilder(this); + var newInputSchemaBuilder = this.GetNewRepetitionSchemaBuilder(this); newInputSchemaBuilder.RepetitionType = RepetitionType.GreedyRepetition; return newInputSchemaBuilder; @@ -50,7 +50,7 @@ public class InputSchemaBuilder public InputSchemaBuilder EndRepetition() { // return back to upper layer of parsing - var currentBuilder = this as RepetitionSchemaBuilder; + var currentBuilder = this as InputSchemaBuilder; if (currentBuilder == null) { throw new Exception("Invalid repetition definitions!"); diff --git a/TextParser/Schema/InputType.cs b/TextParser/Schema/InputType.cs index 2977d73..120c2c0 100644 --- a/TextParser/Schema/InputType.cs +++ b/TextParser/Schema/InputType.cs @@ -1,7 +1,9 @@ namespace Parsing.Schema; +[Flags] public enum InputType { Integer = BlockType.Integer, - String = BlockType.String + String = BlockType.String, + Fragment = BlockType.Fragment, } diff --git a/TextParser/Schema/RepetitionSchemaBuilder.cs b/TextParser/Schema/RepetitionSchemaBuilder.cs index 43f828f..1c989ee 100644 --- a/TextParser/Schema/RepetitionSchemaBuilder.cs +++ b/TextParser/Schema/RepetitionSchemaBuilder.cs @@ -1,13 +1,15 @@ namespace Parsing.Schema; -public class RepetitionSchemaBuilder : InputSchemaBuilder +public abstract class RepetitionSchemaBuilder where S : RepetitionSchemaBuilder, ISchemaBuilder, new() where T : ISchema where U : ISchemaContext { - public RepetitionSchemaBuilder(InputSchemaBuilder upperLayerBuilder) + public S GetNewRepetitionSchemaBuilder(S upperLayerBuilder) { - this.UpperLayerBuilder = upperLayerBuilder; + var newBuilder = new S(); + newBuilder.UpperLayerBuilder = upperLayerBuilder; + return newBuilder; } - public InputSchemaBuilder UpperLayerBuilder { get; set; } + public required S UpperLayerBuilder { get; set; } public int NumRepetition { get; set; } diff --git a/TextParser/Schema/RepetitionType.cs b/TextParser/Schema/RepetitionType.cs index 698dcaa..16a9a2c 100644 --- a/TextParser/Schema/RepetitionType.cs +++ b/TextParser/Schema/RepetitionType.cs @@ -1,7 +1,9 @@ namespace Parsing.Schema; +[Flags] public enum RepetitionType { FixedRepetition = BlockType.FixedRepetition, - GreedyRepetition = BlockType.GreedyRepetition + GreedyRepetition = BlockType.GreedyRepetition, + NonZeroRepetition = BlockType.NonZeroRepetition, } diff --git a/TextParser/TextParser.cs b/TextParser/TextParser.cs index b5f9fc6..a689a71 100644 --- a/TextParser/TextParser.cs +++ b/TextParser/TextParser.cs @@ -5,20 +5,20 @@ using System.Collections.Generic; using Parsing.Schema; using Parsing.Tokenization; -public class TextParser : TokenConverter +public class TextParser : TokenConverter where T : ISchemaContext { - private LineParser lineParser; + private LineParser lineParser; private string[] lines; private bool removeEmptyEntries; - public TextParser(InputSchema schema, string[]? delimiters = null, bool removeEmptyEntries = true) : base() + public TextParser(ISchema schema, string[]? delimiters = null, bool removeEmptyEntries = true) : base() { - this.lineParser = new LineParser(schema, delimiters, removeEmptyEntries); + this.lineParser = new LineParser(schema, delimiters, removeEmptyEntries); this.lines = new string[] { }; this.removeEmptyEntries = removeEmptyEntries; } - public TextParser SetInputText(string text) + public TextParser SetInputText(string text) { var options = StringSplitOptions.TrimEntries; if (removeEmptyEntries) @@ -29,7 +29,7 @@ public class TextParser : TokenConverter return this; } - public TextParser Parse() + public TextParser Parse() { foreach (var line in this.lines) { diff --git a/TextParser/TokenConverter.cs b/TextParser/TokenConverter.cs index 6af2797..d1b9cc4 100644 --- a/TextParser/TokenConverter.cs +++ b/TextParser/TokenConverter.cs @@ -5,6 +5,51 @@ using System.Collections.Generic; using Parsing.Schema; using Parsing.Tokenization; +public static class ConversionHelpers +{ + // public static List ConvertData(this List tokenList, Func converter) where T : IValueToken + // { + // var newList = new List(); + // foreach (var token in tokenList) + // { + // var typedToken = token as IValueToken; + // if (typedToken == null) + // { + // throw new Exception("Invalid Token type encountered during value conversion"); + // } + + // newList.Add(converter(typedToken.GetValue())); + // } + // return newList; + // } + + // public static List ConvertData(this List tokenList, Func, V> converter) where T : IValueToken + // { + // var newList = new List(); + // foreach (var token in tokenList) + // { + // var typedToken = token as IValueToken; + // if (typedToken == null) + // { + // throw new Exception("Invalid Token type encountered during value conversion"); + // } + + // newList.AddRange(converter(typedToken.GetValue())); + // } + // return newList; + // } + + // public static List ConvertData(this List> tokenListList, Func converter) where T : IValueToken + // { + // var newListList = new List>(); + // foreach (var tokenList in tokenListList) + // { + // newListList.Add(tokenList.ConvertData(converter)); + // } + // return newListList; + // } +} + public class TokenConverter { protected List> rawTokens = new List>(); @@ -52,6 +97,41 @@ public class TokenConverter } } + public List AsSingleStream() + { + List returnData = new List(); + foreach (var tokenRow in this.rawTokens) + { // Assert.Equal(InputType.Integer, rows[0][0].GetInputType()); + // Assert.Equal(InputType.String, rows[0][1].GetInputType()); + // Assert.Equal(InputType.String, rows[0][2].GetInputType()); + // Assert.Equal(InputType.String, rows[0][3].GetInputType()); + // Assert.Equal(InputType.String, rows[0][4].GetInputType()); + // Assert.Equal(InputType.String, rows[0][5].GetInputType()); + // Assert.Equal(InputType.Integer, rows[1][0].GetInputType()); + // Assert.Equal(InputType.String, rows[1][1].GetInputType()); + // Assert.Equal(InputType.String, rows[1][2].GetInputType()); + // Assert.Equal(InputType.Integer, rows[2][0].GetInputType()); + // Assert.Equal(InputType.String, rows[2][1].GetInputType()); + // Assert.Equal(InputType.Integer, rows[2][2].GetInputType()); + // Assert.Equal(InputType.String, rows[2][3].GetInputType()); + // Assert.Equal(InputType.String, rows[2][4].GetInputType()); + foreach (IToken token in tokenRow) + { + if (token == null) + { + throw new Exception("No token was provided, but token was expected!"); + } + IValueToken? valueToken = token as IValueToken; + if (valueToken == null) + { + throw new Exception("Provided token is not a ValueToken"); + } + returnData.Add(valueToken.GetValue()); + } + } + return returnData; + } + public List AsRows() { var listRows = this.AsListRows(); @@ -116,4 +196,22 @@ public class TokenConverter { return this.rawTokens; } + + public List AsFragments() + { + var items = this.AsSingleStream(); + var newList = new List(); + + foreach (var item in items) + { + var typedItem = item as Fragment; + if (typedItem == null) + { + throw new Exception("Invalid token type encountered"); + } + newList.Add(typedItem); + } + + return newList; + } } diff --git a/TextParser/Tokenization/FragmentToken.cs b/TextParser/Tokenization/FragmentToken.cs new file mode 100644 index 0000000..bdf0f74 --- /dev/null +++ b/TextParser/Tokenization/FragmentToken.cs @@ -0,0 +1,38 @@ +namespace Parsing.Tokenization; + +using Parsing.Schema; + +public class Fragment : Dictionary> +{ +} + +public class FragmentToken : IValueToken +{ + private string word; + private Fragment matches = new Fragment(); + + public FragmentToken(string word) + { + this.word = word; + } + + public string GetText() + { + return word; + } + + public void AddMatch(string name, List values) + { + this.matches.Add(name, values); + } + + public Fragment GetValue() + { + return this.matches; + } + + public InputType GetInputType() + { + return InputType.Fragment; + } +} diff --git a/TextParser/Tokenization/InputProvider.cs b/TextParser/Tokenization/InputProvider.cs index fbc5964..aacda6e 100644 --- a/TextParser/Tokenization/InputProvider.cs +++ b/TextParser/Tokenization/InputProvider.cs @@ -32,6 +32,12 @@ public class InputProvider this.CurrentPosition = 0; } + public InputProvider(string text) + { + this.words = text.Split("\n"); + this.CurrentPosition = 0; + } + public InputProvider.LookaheadContext GetLookaheadContext() { return new InputProvider.LookaheadContext(this);