From 0d85132a326ebaac3fa3ca1193b495a8b5105837 Mon Sep 17 00:00:00 2001 From: Simon Diesenreiter Date: Mon, 2 Dec 2024 15:30:07 +0100 Subject: [PATCH] feat: implement greedy repetition, ref: A24-13 --- TextParser.Tests/TextParserTests.cs | 42 ++++++++++++++ .../BuildingBlocks/BuildingBlockBase.cs | 4 +- .../BuildingBlocks/FixedRepetitionBlock.cs | 17 +++++- .../BuildingBlocks/GreedyRepetitionBlock.cs | 57 +++++++++++++++++++ .../Schema/BuildingBlocks/IBuildingBlock.cs | 4 +- .../Schema/BuildingBlocks/IntegerBlock.cs | 10 +++- .../Schema/BuildingBlocks/StringBlock.cs | 18 ++++++ TextParser/Schema/InputSchema.cs | 12 +++- TextParser/Schema/InputSchemaBuilder.cs | 12 ++++ TextParser/TokenConverter.cs | 21 +++++++ TextParser/Tokenization/InputProvider.cs | 7 ++- 11 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs diff --git a/TextParser.Tests/TextParserTests.cs b/TextParser.Tests/TextParserTests.cs index 2e8d64b..75e6372 100644 --- a/TextParser.Tests/TextParserTests.cs +++ b/TextParser.Tests/TextParserTests.cs @@ -12,6 +12,9 @@ public class TextParserTests private const string testInput3 = @"2 4 6 1 3 5 7 2 4 6 8 3"; + private const string testInput4 = @"2 ab ba fd er sd + 8 cd dc + 7 uh 6 yp rt"; [Fact] public void LineParser_TestSimpleRepetition() @@ -157,4 +160,43 @@ public class TextParserTests Assert.Equal(2, columns[3][1]); Assert.Equal(3, columns[3][2]); } + + [Fact] + public void TextParser_TestGreedyRepetitionAsRows() + { + var schemaBuilder = new InputSchemaBuilder(); + var schema = schemaBuilder + .Repeat() + .Expect(InputType.Integer) + .Repeat() + .Expect(InputType.String) + .EndRepetition() + .EndRepetition() + .Build(); + + var parser = new TextParser(schema); + var rows = parser + .SetInputText(testInput4) + .Parse() + .AsRawData(); + + Assert.Equal(3, rows.Count); + Assert.Equal(6, rows[0].Count); + Assert.Equal(3, rows[1].Count); + Assert.Equal(5, rows[2].Count); + Assert.Equal(InputType.Integer, rows[0][0].GetInputType()); + Assert.Equal(InputType.String, rows[0][1].GetInputType()); + Assert.Equal(InputType.String, rows[0][2].GetInputType()); + Assert.Equal(InputType.String, rows[0][3].GetInputType()); + Assert.Equal(InputType.String, rows[0][4].GetInputType()); + Assert.Equal(InputType.String, rows[0][5].GetInputType()); + Assert.Equal(InputType.Integer, rows[1][0].GetInputType()); + Assert.Equal(InputType.String, rows[1][1].GetInputType()); + Assert.Equal(InputType.String, rows[1][2].GetInputType()); + Assert.Equal(InputType.Integer, rows[2][0].GetInputType()); + Assert.Equal(InputType.String, rows[2][1].GetInputType()); + Assert.Equal(InputType.Integer, rows[2][2].GetInputType()); + Assert.Equal(InputType.String, rows[2][3].GetInputType()); + Assert.Equal(InputType.String, rows[2][4].GetInputType()); + } } diff --git a/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs b/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs index f046678..0931f32 100644 --- a/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs +++ b/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs @@ -12,6 +12,8 @@ abstract class BuildingBlockBase : IBuildingBlock public abstract bool CanParseWord(InputProvider inputs); + public abstract bool CanParseWord(string word); + public abstract BlockType GetBlockType(); public virtual bool IsRepetitionType() @@ -19,7 +21,7 @@ abstract class BuildingBlockBase : IBuildingBlock return false; } - public virtual bool CheckIsDoneParsingAndReset() + public virtual bool CheckIsDoneParsingAndReset(InputProvider inputs) { // most blocks are always done parsing after consuming a token // repetition blocks can consume multiple tokens diff --git a/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs b/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs index 1a057f2..33be031 100644 --- a/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs +++ b/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs @@ -48,6 +48,21 @@ class FixedRepetitionBlock : BuildingBlockBase return result; } + public override bool CanParseWord(string word) + { + bool result; + if (this.repetitionCount == 0) + { + result = false; + } + else + { + result = inputSchema.CanProcessNextWord(context, word); + } + + return result; + } + public override BlockType GetBlockType() { return BlockType.FixedRepetition; @@ -58,7 +73,7 @@ class FixedRepetitionBlock : BuildingBlockBase return true; } - public override bool CheckIsDoneParsingAndReset() + public override bool CheckIsDoneParsingAndReset(InputProvider inputs) { // we are done parsing once all repetitions are exhausted var result = this.repetitionCount == 0; diff --git a/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs b/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs new file mode 100644 index 0000000..59c9260 --- /dev/null +++ b/TextParser/Schema/BuildingBlocks/GreedyRepetitionBlock.cs @@ -0,0 +1,57 @@ +namespace Parsing.Schema.BuildingBlocks; + +using System.IO.Pipelines; +using Parsing.Tokenization; + +class GreedyRepetitionBlock : BuildingBlockBase +{ + private InputSchema inputSchema; + private InputSchemaContext context; + + public GreedyRepetitionBlock(InputSchema inputSchema) + { + this.inputSchema = inputSchema; + this.context = this.inputSchema.CreateContext(); + } + + public override IToken ParseWord(InputProvider inputs) + { + var result = inputSchema.ProcessNextWord(context, inputs); + if (!this.CanParseWord(inputs)) + { + this.context = this.inputSchema.CreateContext(); + } + return result; + } + + public override bool CanParseWord(InputProvider inputs) + { + return inputSchema.CanProcessNextWord(context, inputs) && inputs.CanYieldWord(); + } + + public override bool CanParseWord(string word) + { + return inputSchema.CanProcessNextWord(context, word); + } + + public override BlockType GetBlockType() + { + return BlockType.GreedyRepetition; + } + + public override bool IsRepetitionType() + { + return true; + } + + public override bool CheckIsDoneParsingAndReset(InputProvider inputs) + { + // we are done parsing greedily once the next token doesn't match anymore + var result = !this.CanParseWord(inputs); + if (result) + { + this.context = this.inputSchema.CreateContext(); + } + return result; + } +} \ No newline at end of file diff --git a/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs b/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs index d6b3339..5ccb8c8 100644 --- a/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs +++ b/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs @@ -8,9 +8,11 @@ public interface IBuildingBlock public bool CanParseWord(InputProvider inputs); + public bool CanParseWord(string word); + public BlockType GetBlockType(); public bool IsRepetitionType(); - public bool CheckIsDoneParsingAndReset(); + public bool CheckIsDoneParsingAndReset(InputProvider inputs); } \ No newline at end of file diff --git a/TextParser/Schema/BuildingBlocks/IntegerBlock.cs b/TextParser/Schema/BuildingBlocks/IntegerBlock.cs index ebaf4c0..5db0f7e 100644 --- a/TextParser/Schema/BuildingBlocks/IntegerBlock.cs +++ b/TextParser/Schema/BuildingBlocks/IntegerBlock.cs @@ -18,12 +18,16 @@ class IntegerBlock : BuildingBlockBase { using (inputs.GetLookaheadContext()) { - int number = 0; - var success = int.TryParse(inputs.YieldWord(), out number); - return success; + return this.CanParseWord(inputs.YieldWord()); } } + public override bool CanParseWord(string word) + { + int number = 0; + return int.TryParse(word, out number); + } + public override BlockType GetBlockType() { return BlockType.Integer; diff --git a/TextParser/Schema/BuildingBlocks/StringBlock.cs b/TextParser/Schema/BuildingBlocks/StringBlock.cs index dea4ae7..881e268 100644 --- a/TextParser/Schema/BuildingBlocks/StringBlock.cs +++ b/TextParser/Schema/BuildingBlocks/StringBlock.cs @@ -16,6 +16,24 @@ class StringBlock : BuildingBlockBase public override bool CanParseWord(InputProvider inputs) { + string word = string.Empty; + using (inputs.GetLookaheadContext()) + { + word = inputs.YieldWord(); + } + + return this.CanParseWord(word); + } + + public override bool CanParseWord(string word) + { + // Here we need to ensure we are not matching any non-string tokens, since string can match pretty much anything + IntegerBlock intBlock = new IntegerBlock(); + if(intBlock.CanParseWord(word)) + { + return false; + } + return true; } diff --git a/TextParser/Schema/InputSchema.cs b/TextParser/Schema/InputSchema.cs index dd5c576..7d9041e 100644 --- a/TextParser/Schema/InputSchema.cs +++ b/TextParser/Schema/InputSchema.cs @@ -29,7 +29,7 @@ public class InputSchema { var nextBlock = this.buildingBlocks[currentContext.lastProcessedBlockIndex]; var token = nextBlock.ParseWord(inputs); - if (!nextBlock.IsRepetitionType() || nextBlock.CheckIsDoneParsingAndReset()) + if (!nextBlock.IsRepetitionType() || nextBlock.CheckIsDoneParsingAndReset(inputs)) { currentContext.lastProcessedBlockIndex++; currentContext.HasFinished = currentContext.lastProcessedBlockIndex >= this.buildingBlocks.Count; @@ -50,6 +50,16 @@ public class InputSchema } } + public bool CanProcessNextWord(InputSchemaContext currentContext, string word) + { + if (currentContext.HasFinished) + { + return false; + } + var nextBlock = this.buildingBlocks[currentContext.lastProcessedBlockIndex]; + return nextBlock.CanParseWord(word); + } + public List ProcessWordList(string[] words) { List tokens = new List(); diff --git a/TextParser/Schema/InputSchemaBuilder.cs b/TextParser/Schema/InputSchemaBuilder.cs index 4114ad2..417b744 100644 --- a/TextParser/Schema/InputSchemaBuilder.cs +++ b/TextParser/Schema/InputSchemaBuilder.cs @@ -38,6 +38,15 @@ public class InputSchemaBuilder return newInputSchemaBuilder; } + public InputSchemaBuilder Repeat() + { + // add another layer of parsing + var newInputSchemaBuilder = new RepetitionSchemaBuilder(this); + newInputSchemaBuilder.RepetitionType = RepetitionType.GreedyRepetition; + + return newInputSchemaBuilder; + } + public InputSchemaBuilder EndRepetition() { // return back to upper layer of parsing @@ -54,6 +63,9 @@ public class InputSchemaBuilder case RepetitionType.FixedRepetition: oldInputSchemaBuilder.schema.AddBuildingBlock(new FixedRepetitionBlock(currentSchema, currentBuilder.NumRepetition)); break; + case RepetitionType.GreedyRepetition: + oldInputSchemaBuilder.schema.AddBuildingBlock(new GreedyRepetitionBlock(currentSchema)); + break; default: throw new Exception("Unrecognized RepetitionType"); } diff --git a/TextParser/TokenConverter.cs b/TextParser/TokenConverter.cs index 4ec1f1c..c24d836 100644 --- a/TextParser/TokenConverter.cs +++ b/TextParser/TokenConverter.cs @@ -38,6 +38,20 @@ public class TokenConverter return returnData; } + private void CheckConversionPrerequisites() + { + // in order to convert rows to columns or grid we require every row to have the same length + int rowLength = this.rawTokens[0].Count; + + foreach(var tokenRow in this.rawTokens) + { + if(tokenRow.Count != rowLength) + { + throw new Exception("Attempted to convert token dataset that is not able to be converted!"); + } + } + } + public List AsRows() { var listRows = this.AsListRows(); @@ -71,6 +85,7 @@ public class TokenConverter public List> AsListColumns() { + this.CheckConversionPrerequisites(); var rows = AsListRows(); var columns = new List>(); @@ -92,7 +107,13 @@ public class TokenConverter public T[][] AsGrid() { + this.CheckConversionPrerequisites(); var rowsList = AsRows(); return rowsList.ToArray(); } + + public List> AsRawData() + { + return this.rawTokens; + } } diff --git a/TextParser/Tokenization/InputProvider.cs b/TextParser/Tokenization/InputProvider.cs index 7494bb2..fbc5964 100644 --- a/TextParser/Tokenization/InputProvider.cs +++ b/TextParser/Tokenization/InputProvider.cs @@ -37,9 +37,14 @@ public class InputProvider return new InputProvider.LookaheadContext(this); } + public bool CanYieldWord() + { + return this.CurrentPosition < this.words.Length; + } + public string YieldWord() { - if (this.CurrentPosition > this.words.Length) + if (!this.CanYieldWord()) { return string.Empty; }